From 9f572981fb7c91c22c993dd591a132d909db01b5 Mon Sep 17 00:00:00 2001
From: Joern Weissenborn <joern.weissenborn@gmail.com>
Date: Thu, 23 Feb 2023 23:31:22 +0100
Subject: [PATCH 1/3] Added basic data pre-processing pipeline.

---
 glotaran/io/preprocessor/__init__.py          |  2 +
 glotaran/io/preprocessor/pipeline.py          | 83 +++++++++++++++++++
 glotaran/io/preprocessor/preprocessor.py      | 76 +++++++++++++++++
 .../io/preprocessor/test/test_preprocessor.py | 35 ++++++++
 requirements_dev.txt                          |  1 +
 setup.cfg                                     |  1 +
 6 files changed, 198 insertions(+)
 create mode 100644 glotaran/io/preprocessor/__init__.py
 create mode 100644 glotaran/io/preprocessor/pipeline.py
 create mode 100644 glotaran/io/preprocessor/preprocessor.py
 create mode 100644 glotaran/io/preprocessor/test/test_preprocessor.py

diff --git a/glotaran/io/preprocessor/__init__.py b/glotaran/io/preprocessor/__init__.py
new file mode 100644
index 000000000..f419c7669
--- /dev/null
+++ b/glotaran/io/preprocessor/__init__.py
@@ -0,0 +1,2 @@
+"""Tools for data pre-processing."""
+from glotaran.io.preprocessor.pipeline import PreProcessingPipeline
diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py
new file mode 100644
index 000000000..530831d88
--- /dev/null
+++ b/glotaran/io/preprocessor/pipeline.py
@@ -0,0 +1,83 @@
+"""A pre-processor pipeline for data."""
+from __future__ import annotations
+
+from typing import Annotated
+
+import xarray as xr
+from pydantic import BaseModel
+from pydantic import Field
+
+from glotaran.io.preprocessor.preprocessor import CorrectBaselineAverage
+from glotaran.io.preprocessor.preprocessor import CorrectBaselineValue
+
+PipelineAction = Annotated[
+    CorrectBaselineValue | CorrectBaselineAverage,
+    Field(discriminator="action"),
+]
+
+
+class PreProcessingPipeline(BaseModel):
+    """A pipeline for pre-processors."""
+
+    actions: list[PipelineAction] = Field(default_factory=list)
+
+    def apply(self, original: xr.DataArray) -> xr.DataArray:
+        """Apply all pre-processors on data.
+
+        Parameters
+        ----------
+        original: xr.DataArray
+            The data to process.
+
+        Returns
+        -------
+        xr.DataArray
+        """
+        result = original.copy()
+
+        for action in self.actions:
+            result = action.apply(result)
+        return result
+
+    def _push_action(self, action: PipelineAction):
+        """Push an action.
+
+        Parameters
+        ----------
+        action: PipelineAction
+            The action to push.
+        """
+        self.actions.append(action)
+
+    def correct_baseline_value(self, value: float) -> PreProcessingPipeline:
+        """Correct a dataset by subtracting a baseline value.
+
+        Parameters
+        ----------
+        value: float
+            The value to subtract.
+
+        Returns
+        -------
+        PreProcessingPipeline
+        """
+        self._push_action(CorrectBaselineValue(value=value))
+        return self
+
+    def correct_baseline_average(
+        self, selection: dict[str, slice | list[int] | int]
+    ) -> PreProcessingPipeline:
+        """Correct a dataset by subtracting the average over a part of the data.
+
+        Parameters
+        ----------
+        selection: dict[str, slice | list[int] | int]
+            The selection to average as dictionary of dimension and indexer.
+            The indexer can be a slice, a list or an integer value.
+
+        Returns
+        -------
+        PreProcessingPipeline
+        """
+        self._push_action(CorrectBaselineAverage(selection=selection))
+        return self
diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py
new file mode 100644
index 000000000..7832dd759
--- /dev/null
+++ b/glotaran/io/preprocessor/preprocessor.py
@@ -0,0 +1,76 @@
+"""Pre-processors for data used by the pre-processing pipeline."""
+from __future__ import annotations
+
+import abc
+from typing import Literal
+
+import xarray as xr
+from pydantic import BaseModel
+
+
+class PreProcessor(BaseModel, abc.ABC):
+    """A base class for pre-processors."""
+
+    class Config:
+        """Config for BaseModel."""
+
+        arbitrary_types_allowed = True
+
+    @abc.abstractmethod
+    def apply(self, data: xr.DataArray) -> xr.DataArray:
+        """Apply the pre-processor.
+
+        Parameters
+        ----------
+        data: xr.DataArray
+            The data to process.
+
+        Returns
+        -------
+        xr.DataArray
+
+        .. # noqa: DAR202
+        """
+
+
+class CorrectBaselineValue(PreProcessor):
+    """Corrects a dataset by subtracting a baseline value."""
+
+    action: Literal["baseline-value"] = "baseline-value"
+    value: float
+
+    def apply(self, data: xr.DataArray) -> xr.DataArray:
+        """Apply the pre-processor.
+
+        Parameters
+        ----------
+        data: xr.DataArray
+            The data to process.
+
+        Returns
+        -------
+        xr.DataArray
+        """
+        return data - self.value
+
+
+class CorrectBaselineAverage(PreProcessor):
+    """Corrects a dataset by subtracting the average over a part of the data."""
+
+    action: Literal["baseline-average"] = "baseline-average"
+    selection: dict[str, slice | list[int] | int]
+
+    def apply(self, data: xr.DataArray) -> xr.DataArray:
+        """Apply the pre-processor.
+
+        Parameters
+        ----------
+        data: xr.DataArray
+            The data to process.
+
+        Returns
+        -------
+        xr.DataArray
+        """
+        selection = data.sel(self.selection)
+        return data - (selection.sum() / selection.size)
diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py
new file mode 100644
index 000000000..e04de2da0
--- /dev/null
+++ b/glotaran/io/preprocessor/test/test_preprocessor.py
@@ -0,0 +1,35 @@
+import pytest
+import xarray as xr
+
+from glotaran.io.preprocessor import PreProcessingPipeline
+
+
+def test_correct_baseline_value():
+    pl = PreProcessingPipeline()
+    pl.correct_baseline_value(1)
+    data = xr.DataArray([[1]])
+    result = pl.apply(data)
+    assert result == data - 1
+
+
+@pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1]))
+def test_correct_baseline_average(indexer: slice | list[int]):
+    pl = PreProcessingPipeline()
+    pl.correct_baseline_average({"dim_0": 0, "dim_1": indexer})
+    data = xr.DataArray([[1.1, 0.9]])
+    result = pl.apply(data)
+    assert (result == data - 1).all()
+
+
+def test_to_from_dict():
+    pl = PreProcessingPipeline()
+    pl.correct_baseline_value(1)
+    pl.correct_baseline_average({"dim_1": slice(0, 2)})
+    pl_dict = pl.dict()
+    assert pl_dict == {
+        "actions": [
+            {"action": "baseline-value", "value": 1.0},
+            {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}},
+        ]
+    }
+    assert PreProcessingPipeline.parse_obj(pl_dict) == pl
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 5982a1092..6502bdc94 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -12,6 +12,7 @@ numpy==1.23.5
 odfpy==1.4.1
 openpyxl==3.1.1
 pandas==1.5.3
+pydantic==1.10.2
 rich==13.3.1
 ruamel.yaml==0.17.21
 scipy==1.10.1
diff --git a/setup.cfg b/setup.cfg
index b08492677..3080b3f3c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,6 +40,7 @@ install_requires =
     odfpy>=1.4.1
     openpyxl>=3.0.10
     pandas>=1.3.4
+    pydantic>=1.10.2
     rich>=10.9.0
     ruamel.yaml>=0.17.17
     scipy>=1.7.2

From c1dbca4465f38ff0b776c5ab6f9becfdbfa5142f Mon Sep 17 00:00:00 2001
From: Joern Weissenborn <joern.weissenborn@gmail.com>
Date: Wed, 1 Mar 2023 19:43:59 +0100
Subject: [PATCH 2/3] Added exclude to correct_baseline_average.

---
 glotaran/io/preprocessor/pipeline.py               | 11 ++++++++---
 glotaran/io/preprocessor/preprocessor.py           |  5 +++--
 glotaran/io/preprocessor/test/test_preprocessor.py | 13 +++++++++++--
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py
index 530831d88..9b1a14c6f 100644
--- a/glotaran/io/preprocessor/pipeline.py
+++ b/glotaran/io/preprocessor/pipeline.py
@@ -65,19 +65,24 @@ def correct_baseline_value(self, value: float) -> PreProcessingPipeline:
         return self
 
     def correct_baseline_average(
-        self, selection: dict[str, slice | list[int] | int]
+        self,
+        selection: dict[str, slice | list[int] | int] | None = None,
+        exclude: dict[str, slice | list[int] | int] | None = None,
     ) -> PreProcessingPipeline:
         """Correct a dataset by subtracting the average over a part of the data.
 
         Parameters
         ----------
-        selection: dict[str, slice | list[int] | int]
+        selection: dict[str, slice | list[int] | int] | None
             The selection to average as dictionary of dimension and indexer.
             The indexer can be a slice, a list or an integer value.
+        exclude: dict[str, slice | list[int] | int] | None
+            Excluded regions from the average as dictionary of dimension and indexer.
+            The indexer can be a slice, a list or an integer value.
 
         Returns
         -------
         PreProcessingPipeline
         """
-        self._push_action(CorrectBaselineAverage(selection=selection))
+        self._push_action(CorrectBaselineAverage(exclude=exclude, selection=selection))
         return self
diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py
index 7832dd759..fa8514791 100644
--- a/glotaran/io/preprocessor/preprocessor.py
+++ b/glotaran/io/preprocessor/preprocessor.py
@@ -58,7 +58,8 @@ class CorrectBaselineAverage(PreProcessor):
     """Corrects a dataset by subtracting the average over a part of the data."""
 
     action: Literal["baseline-average"] = "baseline-average"
-    selection: dict[str, slice | list[int] | int]
+    selection: dict[str, slice | list[int] | int] | None = None
+    exclude: dict[str, slice | list[int] | int] | None = None
 
     def apply(self, data: xr.DataArray) -> xr.DataArray:
         """Apply the pre-processor.
@@ -72,5 +73,5 @@ def apply(self, data: xr.DataArray) -> xr.DataArray:
         -------
         xr.DataArray
         """
-        selection = data.sel(self.selection)
+        selection = data.sel(self.selection or {}).drop_sel(self.exclude or {})
         return data - (selection.sum() / selection.size)
diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py
index e04de2da0..03e9de99e 100644
--- a/glotaran/io/preprocessor/test/test_preprocessor.py
+++ b/glotaran/io/preprocessor/test/test_preprocessor.py
@@ -15,12 +15,21 @@ def test_correct_baseline_value():
 @pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1]))
 def test_correct_baseline_average(indexer: slice | list[int]):
     pl = PreProcessingPipeline()
-    pl.correct_baseline_average({"dim_0": 0, "dim_1": indexer})
+    pl.correct_baseline_average(selection={"dim_0": 0, "dim_1": indexer})
     data = xr.DataArray([[1.1, 0.9]])
     result = pl.apply(data)
     assert (result == data - 1).all()
 
 
+def test_correct_baseline_average_exclude():
+    pl = PreProcessingPipeline()
+    pl.correct_baseline_average(exclude={"dim_1": 1})
+    data = xr.DataArray([[1.1, 0.9]])
+    result = pl.apply(data)
+    print(result)
+    assert (result == data - 1.1).all()
+
+
 def test_to_from_dict():
     pl = PreProcessingPipeline()
     pl.correct_baseline_value(1)
@@ -29,7 +38,7 @@ def test_to_from_dict():
     assert pl_dict == {
         "actions": [
             {"action": "baseline-value", "value": 1.0},
-            {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}},
+            {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}, "exclude": None},
         ]
     }
     assert PreProcessingPipeline.parse_obj(pl_dict) == pl

From 9473bf56e48a303d47d4a6ec402643ac8c2da3a7 Mon Sep 17 00:00:00 2001
From: Joern Weissenborn <joern.weissenborn@gmail.com>
Date: Wed, 1 Mar 2023 20:29:02 +0100
Subject: [PATCH 3/3] Renamed 'selection' to 'select' and used mean() for the baseline average.

---
 glotaran/io/preprocessor/pipeline.py               | 6 +++---
 glotaran/io/preprocessor/preprocessor.py           | 5 ++---
 glotaran/io/preprocessor/test/test_preprocessor.py | 6 +++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py
index 9b1a14c6f..c331b8008 100644
--- a/glotaran/io/preprocessor/pipeline.py
+++ b/glotaran/io/preprocessor/pipeline.py
@@ -66,14 +66,14 @@ def correct_baseline_value(self, value: float) -> PreProcessingPipeline:
 
     def correct_baseline_average(
         self,
-        selection: dict[str, slice | list[int] | int] | None = None,
+        select: dict[str, slice | list[int] | int] | None = None,
         exclude: dict[str, slice | list[int] | int] | None = None,
     ) -> PreProcessingPipeline:
         """Correct a dataset by subtracting the average over a part of the data.
 
         Parameters
         ----------
-        selection: dict[str, slice | list[int] | int] | None
+        select: dict[str, slice | list[int] | int] | None
             The selection to average as dictionary of dimension and indexer.
             The indexer can be a slice, a list or an integer value.
         exclude: dict[str, slice | list[int] | int] | None
@@ -84,5 +84,5 @@ def correct_baseline_average(
         -------
         PreProcessingPipeline
         """
-        self._push_action(CorrectBaselineAverage(exclude=exclude, selection=selection))
+        self._push_action(CorrectBaselineAverage(exclude=exclude, select=select))
         return self
diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py
index fa8514791..6d918a96e 100644
--- a/glotaran/io/preprocessor/preprocessor.py
+++ b/glotaran/io/preprocessor/preprocessor.py
@@ -58,7 +58,7 @@ class CorrectBaselineAverage(PreProcessor):
     """Corrects a dataset by subtracting the average over a part of the data."""
 
     action: Literal["baseline-average"] = "baseline-average"
-    selection: dict[str, slice | list[int] | int] | None = None
+    select: dict[str, slice | list[int] | int] | None = None
     exclude: dict[str, slice | list[int] | int] | None = None
 
     def apply(self, data: xr.DataArray) -> xr.DataArray:
@@ -73,5 +73,4 @@ def apply(self, data: xr.DataArray) -> xr.DataArray:
         -------
         xr.DataArray
         """
-        selection = data.sel(self.selection or {}).drop_sel(self.exclude or {})
-        return data - (selection.sum() / selection.size)
+        return data - data.sel(self.select or {}).drop_sel(self.exclude or {}).mean()
diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py
index 03e9de99e..f6a6e283e 100644
--- a/glotaran/io/preprocessor/test/test_preprocessor.py
+++ b/glotaran/io/preprocessor/test/test_preprocessor.py
@@ -15,7 +15,7 @@ def test_correct_baseline_value():
 @pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1]))
 def test_correct_baseline_average(indexer: slice | list[int]):
     pl = PreProcessingPipeline()
-    pl.correct_baseline_average(selection={"dim_0": 0, "dim_1": indexer})
+    pl.correct_baseline_average(select={"dim_0": 0, "dim_1": indexer})
     data = xr.DataArray([[1.1, 0.9]])
     result = pl.apply(data)
     assert (result == data - 1).all()
@@ -23,7 +23,7 @@ def test_correct_baseline_average(indexer: slice | list[int]):
 
 def test_correct_baseline_average_exclude():
     pl = PreProcessingPipeline()
-    pl.correct_baseline_average(exclude={"dim_1": 1})
+    pl.correct_baseline_average(select={"dim_0": 0}, exclude={"dim_1": 1})
     data = xr.DataArray([[1.1, 0.9]])
     result = pl.apply(data)
     print(result)
@@ -38,7 +38,7 @@ def test_to_from_dict():
     assert pl_dict == {
         "actions": [
             {"action": "baseline-value", "value": 1.0},
-            {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}, "exclude": None},
+            {"action": "baseline-average", "select": {"dim_1": slice(0, 2)}, "exclude": None},
         ]
     }
     assert PreProcessingPipeline.parse_obj(pl_dict) == pl