Merge pull request #314 from OpenCOMPES/SXP_beamtime_additions

Sxp beamtime additions
OpenCOMPES · Dec 24, 2023 · 1e1157e · 1e1157e
2 parents ab48cf5 + df65e7f
commit 1e1157e
Show file tree

Hide file tree

Showing 5 changed files with 327 additions and 108 deletions.
diff --git a/sed/loader/sxp/loader.py b/sed/loader/sxp/loader.py
@@ -51,7 +51,7 @@ def __init__(self, config: dict) -> None:
         self.index_per_electron: MultiIndex = None
         self.index_per_pulse: MultiIndex = None
         self.failed_files_error: List[str] = []
-        self.nhits: np.ndarray = None
+        self.array_indices: List[List[slice]] = None
 
     def initialize_paths(self) -> Tuple[List[Path], Path]:
         """
@@ -132,7 +132,7 @@ def get_files_from_run_id(
         stream_name_prefixes = self._config["dataframe"]["stream_name_prefixes"]
         stream_name_postfixes = self._config["dataframe"].get("stream_name_postfixes", {})
 
-        if isinstance(run_id, int):
+        if isinstance(run_id, (int, np.integer)):
             run_id = str(run_id).zfill(4)
 
         if folders is None:
@@ -172,6 +172,7 @@ def available_channels(self) -> List:
         excluding pulseId, defined by the json file"""
         available_channels = list(self._config["dataframe"]["channels"].keys())
         available_channels.remove("pulseId")
+        available_channels.remove("trainId")
         return available_channels
 
     def get_channels(self, formats: Union[str, List[str]] = "", index: bool = False) -> List[str]:
@@ -220,7 +221,7 @@ def reset_multi_index(self) -> None:
         """Resets the index per pulse and electron"""
         self.index_per_electron = None
         self.index_per_pulse = None
-        self.nhits = None
+        self.array_indices = None
 
     def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
         """
@@ -237,30 +238,50 @@ def create_multi_index_per_electron(self, h5_file: h5py.File) -> None:
                 as the index levels.
         """
 
-        # Macrobunch IDs obtained from the pulseId channel
-        train_id, np_array = self.create_numpy_array_per_channel(
+        # relative macrobunch IDs obtained from the trainId channel
+        train_id, mab_array = self.create_numpy_array_per_channel(
             h5_file,
-            "pulseId",
+            "trainId",
         )
-        # change trailing zeros to nan
-        nhits = (
-            np_array.shape[1]
-            - np.argmax((np.diff(np_array.astype(np.int32)) < 0)[:, ::-1], axis=1)
-            - 1
+        # Internal microbunch IDs obtained from the pulseId channel
+        train_id, mib_array = self.create_numpy_array_per_channel(
+            h5_file,
+            "pulseId",
         )
-        nhits[nhits == np_array.shape[1] - 1] = 0
-        self.nhits = nhits
-        np_array = np_array.astype("float")
-        for i in range(np_array.shape[0]):
-            np_array[i, nhits[i] :] = np.nan
 
+        # Chopping data into trains
+        macrobunch_index = []
+        microbunch_ids = []
+        macrobunch_indices = []
+        for i in train_id.index:
+            # removing broken trailing hit copies
+            num_trains = self._config["dataframe"].get("num_trains", 0)
+            if num_trains:
+                try:
+                    num_valid_hits = np.where(np.diff(mib_array[i].astype(np.int32)) < 0)[0][
+                        num_trains - 1
+                    ]
+                    mab_array[i, num_valid_hits:] = 0
+                    mib_array[i, num_valid_hits:] = 0
+                except IndexError:
+                    pass
+            train_ends = np.where(np.diff(mib_array[i].astype(np.int32)) < -1)[0]
+            indices = []
+            index = 0
+            for train, train_end in enumerate(train_ends):
+                macrobunch_index.append(train_id[i] + np.uint(train))
+                microbunch_ids.append(mib_array[i, index:train_end])
+                indices.append(slice(index, train_end))
+                index = train_end + 1
+            macrobunch_indices.append(indices)
+        self.array_indices = macrobunch_indices
         # Create a series with the macrobunches as index and
         # microbunches as values
         macrobunches = (
             Series(
-                (np_array[i] for i in train_id.index),
+                (microbunch_ids[i] for i in range(len(macrobunch_index))),
                 name="pulseId",
-                index=train_id,
+                index=macrobunch_index,
             )
             - self._config["dataframe"]["ubid_offset"]
         )
@@ -358,20 +379,22 @@ def create_numpy_array_per_channel(
                 channel_dict["slice"],
                 axis=1,
             )
+
+        if "scale" in channel_dict:
+            np_array = np_array / float(channel_dict["scale"])
+
         return train_id, np_array
 
     def create_dataframe_per_electron(
         self,
         np_array: np.ndarray,
-        train_id: Series,
         channel: str,
     ) -> DataFrame:
         """
         Returns a pandas DataFrame for a given channel name of type [per electron].
 
         Args:
             np_array (np.ndarray): The numpy array containing the channel data.
-            train_id (Series): The train ID Series.
             channel (str): The name of the channel.
 
         Returns:
@@ -382,13 +405,16 @@ def create_dataframe_per_electron(
             is set, and the NaN values are dropped, alongside the pulseId = 0 (meaningless).
 
         """
-        if self.nhits is None or self.nhits.shape[0] != np_array.shape[0]:
-            raise RuntimeError("nhits not set correctly, internal inconstency detected.")
-        np_array = np_array.astype("float")
-        for i in range(np_array.shape[0]):
-            np_array[i, self.nhits[i] :] = np.nan
+        if self.array_indices is None or len(self.array_indices) != np_array.shape[0]:
+            raise RuntimeError(
+                "macrobunch_indices not set correctly, internal inconstency detected.",
+            )
+        train_data = []
+        for i, _ in enumerate(self.array_indices):
+            for indices in self.array_indices[i]:
+                train_data.append(np_array[i, indices])
         return (
-            Series((np_array[i] for i in train_id.index), name=channel)
+            Series((train for train in train_data), name=channel)
             .explode()
             .dropna()
             .to_frame()
@@ -530,7 +556,6 @@ def create_dataframe_per_channel(
             # Create a DataFrame for electron-resolved data
             data = self.create_dataframe_per_electron(
                 np_array,
-                train_id,
                 channel,
             )
 

diff --git a/tests/data/loader/sxp/config.yaml b/tests/data/loader/sxp/config.yaml
@@ -53,6 +53,11 @@ dataframe:
       dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/starterCounter"
       index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId"
       max_hits: 500
+    trainId:
+      format: per_electron
+      dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/masterCounter"
+      index_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/trainId"
+      max_hits: 500
     dldPosX:
       format: per_electron
       dataset_key: "/INSTRUMENT/SXP_TR_XPES/DET/MCP_DETECTOR:output/data/x"