From 1d078743efcb85ddace880d930a4ed95772aaf9b Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Thu, 5 Oct 2023 15:54:42 +0100
Subject: [PATCH 1/7] Expose .units and .units_name on KeyData objects

---
 extra_data/keydata.py            | 29 +++++++++++++++++++++++++++--
 extra_data/tests/mockdata/xgm.py | 14 ++++++++++++++
 extra_data/tests/test_keydata.py | 11 +++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index b121c113..b029c1bd 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -106,6 +106,28 @@ def size_gb(self):
         """The size of the data in memory in gigabytes."""
         return self.nbytes / 1e9
 
+    @property
+    def units(self):
+        dset = self.files[0].file[self.hdf5_data_path]
+        base_unit = dset.attrs.get('unitSymbol', None)
+        if base_unit is None:
+            return None
+
+        prefix = dset.attrs.get('metricPrefixSymbol', '')
+        if prefix == 'u':
+            prefix = 'μ'  # We are not afraid of unicode
+        return prefix + base_unit
+
+    @property
+    def units_name(self):
+        dset = self.files[0].file[self.hdf5_data_path]
+        base_unit = dset.attrs.get('unitName', None)
+        if base_unit is None:
+            return None
+
+        prefix = dset.attrs.get('metricPrefixName', '')
+        return prefix + base_unit
+
     @property
     def source_file_paths(self):
         paths = []
@@ -144,8 +166,11 @@ def __getitem__(self, item):
 
     def _only_tids(self, tids):
         tids_arr = np.array(tids)
-        files = [f for f in self.files
-                 if f.has_train_ids(tids_arr, self.inc_suspect_trains)]
+        # Keep 1 file, even if 0 trains selected.
+        files = [
+            f for f in self.files
+            if f.has_train_ids(tids_arr, self.inc_suspect_trains)
+        ] or [self.files[0]]
 
         return KeyData(
             self.source,
diff --git a/extra_data/tests/mockdata/xgm.py b/extra_data/tests/mockdata/xgm.py
index 5f247474..8c972240 100644
--- a/extra_data/tests/mockdata/xgm.py
+++ b/extra_data/tests/mockdata/xgm.py
@@ -1,3 +1,5 @@
+import numpy as np
+
 from .base import DeviceBase
 
 class XGM(DeviceBase):
@@ -53,3 +55,15 @@ class XGM(DeviceBase):
         ('xTD', 'f4', (1000,)),
         ('yTD', 'f4', (1000,)),
     ]
+
+    def write_instrument(self, f):
+        super().write_instrument(f)
+
+        # Annotate intensityTD with some units to test retrieving them
+        ds = f[f'INSTRUMENT/{self.device_id}:output/data/intensityTD']
+        ds.attrs['metricPrefixEnum']= np.array([14], dtype=np.int32)
+        ds.attrs['metricPrefixName'] = 'micro'
+        ds.attrs['metricPrefixSymbol'] = b'u'
+        ds.attrs['unitEnum'] = [15]
+        ds.attrs['unitName'] = b'joule'
+        ds.attrs['unitSymbol'] = b'J'
diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index d2a9147a..f1ec4808 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -339,3 +339,14 @@ def test_file_no_trains(run_with_file_no_trains):
     run = RunDirectory(run_with_file_no_trains)
     xpos = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos'].ndarray()
     assert xpos.shape == (64,)
+
+
+def test_units(mock_sa3_control_data):
+    run = H5File(mock_sa3_control_data)
+    xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD']
+
+    assert xgm_intensity.units == 'μJ'
+    assert xgm_intensity.units_name == 'microjoule'
+
+    # Check that it still works after selecting 0 trains
+    assert xgm_intensity.select_trains(np.s_[:0]).units == 'μJ'

From 4b1e302da0a0e28f9eda8cb7dace6a97063c22d3 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Thu, 5 Oct 2023 16:00:12 +0100
Subject: [PATCH 2/7] Write attributes more carefully for tests

---
 extra_data/tests/mockdata/xgm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/extra_data/tests/mockdata/xgm.py b/extra_data/tests/mockdata/xgm.py
index 8c972240..d25f7372 100644
--- a/extra_data/tests/mockdata/xgm.py
+++ b/extra_data/tests/mockdata/xgm.py
@@ -60,10 +60,11 @@ def write_instrument(self, f):
         super().write_instrument(f)
 
         # Annotate intensityTD with some units to test retrieving them
+        # Karabo stores ASCII strings, assigning bytes is a shortcut to mimic that
         ds = f[f'INSTRUMENT/{self.device_id}:output/data/intensityTD']
         ds.attrs['metricPrefixEnum']= np.array([14], dtype=np.int32)
-        ds.attrs['metricPrefixName'] = 'micro'
+        ds.attrs['metricPrefixName'] = b'micro'
         ds.attrs['metricPrefixSymbol'] = b'u'
-        ds.attrs['unitEnum'] = [15]
+        ds.attrs['unitEnum'] = np.array([15], dtype=np.int32)
         ds.attrs['unitName'] = b'joule'
         ds.attrs['unitSymbol'] = b'J'

From 6a1c963186255d16a437554a0cf7fab4bc6716bc Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Thu, 5 Oct 2023 16:16:37 +0100
Subject: [PATCH 3/7] Update test for selecting 0 trains

---
 extra_data/tests/test_keydata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index f1ec4808..d69b81e0 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -49,7 +49,7 @@ def test_select_trains(mock_spb_raw_run):
     # Empty selection
     sel2 = xgm_beam_x[80:]
     assert sel2.shape == (0,)
-    assert len(sel2.files) == 0
+    assert len(sel2.files) == 1
     assert sel2.xarray().shape == (0,)
 
     # Single train

From d3fa46cfd7f1c151dc768209593723cf9f35edd0 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 9 Oct 2023 14:44:01 +0100
Subject: [PATCH 4/7] Look in source file for dataset attributes if required

---
 extra_data/keydata.py            | 25 +++++++++++++++++++------
 extra_data/tests/test_keydata.py | 10 ++++++++++
 extra_data/tests/test_voview.py  |  2 ++
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index b029c1bd..0c6fb381 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -1,5 +1,6 @@
 from typing import List, Optional, Tuple
 
+import h5py
 import numpy as np
 
 from .exceptions import TrainIDError, NoDataError
@@ -108,24 +109,24 @@ def size_gb(self):
 
     @property
     def units(self):
-        dset = self.files[0].file[self.hdf5_data_path]
-        base_unit = dset.attrs.get('unitSymbol', None)
+        attrs = self.attributes()
+        base_unit = attrs.get('unitSymbol', None)
         if base_unit is None:
             return None
 
-        prefix = dset.attrs.get('metricPrefixSymbol', '')
+        prefix = attrs.get('metricPrefixSymbol', '')
         if prefix == 'u':
             prefix = 'μ'  # We are not afraid of unicode
         return prefix + base_unit
 
     @property
     def units_name(self):
-        dset = self.files[0].file[self.hdf5_data_path]
-        base_unit = dset.attrs.get('unitName', None)
+        attrs = self.attributes()
+        base_unit = attrs.get('unitName', None)
         if base_unit is None:
             return None
 
-        prefix = dset.attrs.get('metricPrefixName', '')
+        prefix = attrs.get('metricPrefixName', '')
         return prefix + base_unit
 
     @property
@@ -151,6 +152,18 @@ def source_file_paths(self):
         from pathlib import Path
         return [Path(p) for p in paths]
 
+    def attributes(self):
+        dset = self.files[0].file[self.hdf5_data_path]
+        if len(dset.attrs) == 0 and dset.is_virtual:
+            # Virtual datasets were initially created without these attributes.
+            # Find a source file. Not using source_file_paths as it can give [].
+            _, filename, _, _ = dset.virtual_sources()[0]
+            # Not using FileAccess: no need for train or source lists.
+            f = h5py.File(filename, 'r')
+            dset = f[self.hdf5_data_path]
+
+        return dict(dset.attrs)
+
     def select_trains(self, trains):
         """Select a subset of trains in this data as a new :class:`KeyData` object.
 
diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index d69b81e0..c9f00f8a 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -341,6 +341,16 @@ def test_file_no_trains(run_with_file_no_trains):
     assert xpos.shape == (64,)
 
 
+def test_attributes(mock_sa3_control_data):
+    run = H5File(mock_sa3_control_data)
+    xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD']
+    attrs = xgm_intensity.attributes()
+
+    assert isinstance(attrs, dict)
+    assert attrs['metricPrefixName'] == 'micro'
+    assert attrs['unitSymbol'] == 'J'
+
+
 def test_units(mock_sa3_control_data):
     run = H5File(mock_sa3_control_data)
     xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD']
diff --git a/extra_data/tests/test_voview.py b/extra_data/tests/test_voview.py
index cbdfbe1d..60df0d33 100644
--- a/extra_data/tests/test_voview.py
+++ b/extra_data/tests/test_voview.py
@@ -63,6 +63,8 @@ def test_use_voview(mock_spb_raw_run, tmp_path):
     assert {p.name for p in xgm_intens[:30].source_file_paths} == {
         'RAW-R0238-DA01-S00000.h5'
     }
+    assert xgm_intens.units == 'μJ'
+    assert xgm_intens.units_name == 'microjoule'
 
 
 

From a8d7e70a87c9babde64ae336ab5e33be9cc0983a Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 9 Oct 2023 15:53:46 +0100
Subject: [PATCH 5/7] Use 'with h5py.File()' to ensure file is closed
 afterwards

---
 extra_data/keydata.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 0c6fb381..5b0d855a 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -153,14 +153,20 @@ def source_file_paths(self):
         return [Path(p) for p in paths]
 
     def attributes(self):
+        """Get a dict of all attributes stored with this data
+
+        This may be awkward to use. See .units and .units_name for more
+        convenient forms.
+        """
         dset = self.files[0].file[self.hdf5_data_path]
-        if len(dset.attrs) == 0 and dset.is_virtual:
+        attrs = dict(dset.attrs)
+        if (not attrs) and dset.is_virtual:
             # Virtual datasets were initially created without these attributes.
             # Find a source file. Not using source_file_paths as it can give [].
             _, filename, _, _ = dset.virtual_sources()[0]
             # Not using FileAccess: no need for train or source lists.
-            f = h5py.File(filename, 'r')
-            dset = f[self.hdf5_data_path]
+            with h5py.File(filename, 'r') as f:
+                attrs = dict(f[self.hdf5_data_path].attrs)
 
         return dict(dset.attrs)
 

From 5d0d95de4f447313d2bbb82b0a53a5bf4d3be313 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 9 Oct 2023 15:53:55 +0100
Subject: [PATCH 6/7] Document units & units_name

---
 docs/reading_files.rst | 4 ++++
 extra_data/keydata.py  | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/docs/reading_files.rst b/docs/reading_files.rst
index 1939c6f4..31b9f639 100644
--- a/docs/reading_files.rst
+++ b/docs/reading_files.rst
@@ -221,6 +221,10 @@ below, e.g.::
    
       .. versionadded:: 1.9
 
+   .. autoattribute:: units
+
+   .. autoattribute:: units_name
+
 
 The run or file object (a :class:`DataCollection`) also has methods to load
 data by sources and keys. :meth:`get_array`, :meth:`get_dask_array` and
diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 5b0d855a..2931785d 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -109,6 +109,7 @@ def size_gb(self):
 
     @property
     def units(self):
+        """The units symbol for this data, e.g. 'μJ', or None if not found"""
         attrs = self.attributes()
         base_unit = attrs.get('unitSymbol', None)
         if base_unit is None:
@@ -121,6 +122,7 @@ def units(self):
 
     @property
     def units_name(self):
+        """The units name for this data, e.g. 'microjoule', or None if not found"""
         attrs = self.attributes()
         base_unit = attrs.get('unitName', None)
         if base_unit is None:

From 5926e94203ede93696ce893f31a44a06e9e295e5 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Mon, 9 Oct 2023 16:53:38 +0100
Subject: [PATCH 7/7] Fix attributes() method with virtual datasets

---
 extra_data/keydata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 2931785d..69ac3223 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -170,7 +170,7 @@ def attributes(self):
             with h5py.File(filename, 'r') as f:
                 attrs = dict(f[self.hdf5_data_path].attrs)
 
-        return dict(dset.attrs)
+        return attrs
 
     def select_trains(self, trains):
         """Select a subset of trains in this data as a new :class:`KeyData` object.