From 9f572981fb7c91c22c993dd591a132d909db01b5 Mon Sep 17 00:00:00 2001 From: Joern Weissenborn Date: Thu, 23 Feb 2023 23:31:22 +0100 Subject: [PATCH 1/3] Added basic data pre-processing pipeline. --- glotaran/io/preprocessor/__init__.py | 2 + glotaran/io/preprocessor/pipeline.py | 83 +++++++++++++++++++ glotaran/io/preprocessor/preprocessor.py | 76 +++++++++++++++++ .../io/preprocessor/test/test_preprocessor.py | 35 ++++++++ requirements_dev.txt | 1 + setup.cfg | 1 + 6 files changed, 198 insertions(+) create mode 100644 glotaran/io/preprocessor/__init__.py create mode 100644 glotaran/io/preprocessor/pipeline.py create mode 100644 glotaran/io/preprocessor/preprocessor.py create mode 100644 glotaran/io/preprocessor/test/test_preprocessor.py diff --git a/glotaran/io/preprocessor/__init__.py b/glotaran/io/preprocessor/__init__.py new file mode 100644 index 000000000..f419c7669 --- /dev/null +++ b/glotaran/io/preprocessor/__init__.py @@ -0,0 +1,2 @@ +"""Tools for data pre-processing.""" +from glotaran.io.preprocessor.pipeline import PreProcessingPipeline diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py new file mode 100644 index 000000000..530831d88 --- /dev/null +++ b/glotaran/io/preprocessor/pipeline.py @@ -0,0 +1,83 @@ +"""A pre-processor pipeline for data.""" +from __future__ import annotations + +from typing import Annotated + +import xarray as xr +from pydantic import BaseModel +from pydantic import Field + +from glotaran.io.preprocessor.preprocessor import CorrectBaselineAverage +from glotaran.io.preprocessor.preprocessor import CorrectBaselineValue + +PipelineAction = Annotated[ + CorrectBaselineValue | CorrectBaselineAverage, + Field(discriminator="action"), +] + + +class PreProcessingPipeline(BaseModel): + """A pipeline for pre-processors.""" + + actions: list[PipelineAction] = Field(default_factory=list) + + def apply(self, original: xr.DataArray) -> xr.DataArray: + """Apply all pre-processors on data. 
+ + Parameters + ---------- + original: xr.DataArray + The data to process. + + Returns + ------- + xr.DataArray + """ + result = original.copy() + + for action in self.actions: + result = action.apply(result) + return result + + def _push_action(self, action: PipelineAction): + """Push an action. + + Parameters + ---------- + action: PipelineAction + The action to push. + """ + self.actions.append(action) + + def correct_baseline_value(self, value: float) -> PreProcessingPipeline: + """Correct a dataset by subtracting a baseline value. + + Parameters + ---------- + value: float + The value to subtract. + + Returns + ------- + PreProcessingPipeline + """ + self._push_action(CorrectBaselineValue(value=value)) + return self + + def correct_baseline_average( + self, selection: dict[str, slice | list[int] | int] + ) -> PreProcessingPipeline: + """Correct a dataset by subtracting the average over a part of the data. + + Parameters + ---------- + selection: dict[str, slice | list[int] | int] + The selection to average as dictionary of dimension and indexer. + The indexer can be a slice, a list or an integer value. + + Returns + ------- + PreProcessingPipeline + """ + self._push_action(CorrectBaselineAverage(selection=selection)) + return self diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py new file mode 100644 index 000000000..7832dd759 --- /dev/null +++ b/glotaran/io/preprocessor/preprocessor.py @@ -0,0 +1,76 @@ +"""Pre-processors for data.""" +from __future__ import annotations + +import abc +from typing import Literal + +import xarray as xr +from pydantic import BaseModel + + +class PreProcessor(BaseModel, abc.ABC): + """A base class for pre-processors.""" + + class Config: + """Config for BaseModel.""" + + arbitrary_types_allowed = True + + @abc.abstractmethod + def apply(self, data: xr.DataArray) -> xr.DataArray: + """Apply the pre-processor.
+ + Parameters + ---------- + data: xr.DataArray + The data to process. + + Returns + ------- + xr.DataArray + + .. # noqa: DAR202 + """ + + +class CorrectBaselineValue(PreProcessor): + """Corrects a dataset by subtracting a baseline value.""" + + action: Literal["baseline-value"] = "baseline-value" + value: float + + def apply(self, data: xr.DataArray) -> xr.DataArray: + """Apply the pre-processor. + + Parameters + ---------- + data: xr.DataArray + The data to process. + + Returns + ------- + xr.DataArray + """ + return data - self.value + + +class CorrectBaselineAverage(PreProcessor): + """Corrects a dataset by subtracting the average over a part of the data.""" + + action: Literal["baseline-average"] = "baseline-average" + selection: dict[str, slice | list[int] | int] + + def apply(self, data: xr.DataArray) -> xr.DataArray: + """Apply the pre-processor. + + Parameters + ---------- + data: xr.DataArray + The data to process. + + Returns + ------- + xr.DataArray + """ + selection = data.sel(self.selection) + return data - (selection.sum() / selection.size) diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py new file mode 100644 index 000000000..e04de2da0 --- /dev/null +++ b/glotaran/io/preprocessor/test/test_preprocessor.py @@ -0,0 +1,35 @@ +import pytest +import xarray as xr + +from glotaran.io.preprocessor import PreProcessingPipeline + + +def test_correct_baseline_value(): + pl = PreProcessingPipeline() + pl.correct_baseline_value(1) + data = xr.DataArray([[1]]) + result = pl.apply(data) + assert result == data - 1 + + +@pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1])) +def test_correct_baseline_average(indexer: slice | list[int]): + pl = PreProcessingPipeline() + pl.correct_baseline_average({"dim_0": 0, "dim_1": indexer}) + data = xr.DataArray([[1.1, 0.9]]) + result = pl.apply(data) + assert (result == data - 1).all() + + +def test_to_from_dict(): + pl = PreProcessingPipeline() +
pl.correct_baseline_value(1) + pl.correct_baseline_average({"dim_1": slice(0, 2)}) + pl_dict = pl.dict() + assert pl_dict == { + "actions": [ + {"action": "baseline-value", "value": 1.0}, + {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}}, + ] + } + assert PreProcessingPipeline.parse_obj(pl_dict) == pl diff --git a/requirements_dev.txt b/requirements_dev.txt index 5982a1092..6502bdc94 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -12,6 +12,7 @@ numpy==1.23.5 odfpy==1.4.1 openpyxl==3.1.1 pandas==1.5.3 +pydantic==1.10.2 rich==13.3.1 ruamel.yaml==0.17.21 scipy==1.10.1 diff --git a/setup.cfg b/setup.cfg index b08492677..3080b3f3c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,6 +40,7 @@ install_requires = odfpy>=1.4.1 openpyxl>=3.0.10 pandas>=1.3.4 + pydantic>=1.10.2 rich>=10.9.0 ruamel.yaml>=0.17.17 scipy>=1.7.2 From c1dbca4465f38ff0b776c5ab6f9becfdbfa5142f Mon Sep 17 00:00:00 2001 From: Joern Weissenborn Date: Wed, 1 Mar 2023 19:43:59 +0100 Subject: [PATCH 2/3] Added exclude to correct_baseline_average. --- glotaran/io/preprocessor/pipeline.py | 11 ++++++++--- glotaran/io/preprocessor/preprocessor.py | 5 +++-- glotaran/io/preprocessor/test/test_preprocessor.py | 13 +++++++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py index 530831d88..9b1a14c6f 100644 --- a/glotaran/io/preprocessor/pipeline.py +++ b/glotaran/io/preprocessor/pipeline.py @@ -65,19 +65,24 @@ def correct_baseline_value(self, value: float) -> PreProcessingPipeline: return self def correct_baseline_average( - self, selection: dict[str, slice | list[int] | int] + self, + selection: dict[str, slice | list[int] | int] | None = None, + exclude: dict[str, slice | list[int] | int] | None = None, ) -> PreProcessingPipeline: """Correct a dataset by subtracting the average over a part of the data. 
Parameters ---------- - selection: dict[str, slice | list[int] | int] + selection: dict[str, slice | list[int] | int] | None The selection to average as dictionary of dimension and indexer. The indexer can be a slice, a list or an integer value. + exclude: dict[str, slice | list[int] | int] | None + Excluded regions from the average as dictionary of dimension and indexer. + The indexer can be a slice, a list or an integer value. Returns ------- PreProcessingPipeline """ - self._push_action(CorrectBaselineAverage(selection=selection)) + self._push_action(CorrectBaselineAverage(exclude=exclude, selection=selection)) return self diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py index 7832dd759..fa8514791 100644 --- a/glotaran/io/preprocessor/preprocessor.py +++ b/glotaran/io/preprocessor/preprocessor.py @@ -58,7 +58,8 @@ class CorrectBaselineAverage(PreProcessor): """Corrects a dataset by subtracting the average over a part of the data.""" action: Literal["baseline-average"] = "baseline-average" - selection: dict[str, slice | list[int] | int] + selection: dict[str, slice | list[int] | int] | None = None + exclude: dict[str, slice | list[int] | int] | None = None def apply(self, data: xr.DataArray) -> xr.DataArray: """Apply the pre-processor. 
@@ -72,5 +73,5 @@ def apply(self, data: xr.DataArray) -> xr.DataArray: ------- xr.DataArray """ - selection = data.sel(self.selection) + selection = data.sel(self.selection or {}).drop_sel(self.exclude or {}) return data - (selection.sum() / selection.size) diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py index e04de2da0..03e9de99e 100644 --- a/glotaran/io/preprocessor/test/test_preprocessor.py +++ b/glotaran/io/preprocessor/test/test_preprocessor.py @@ -15,12 +15,21 @@ def test_correct_baseline_value(): @pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1])) def test_correct_baseline_average(indexer: slice | list[int]): pl = PreProcessingPipeline() - pl.correct_baseline_average({"dim_0": 0, "dim_1": indexer}) + pl.correct_baseline_average(selection={"dim_0": 0, "dim_1": indexer}) data = xr.DataArray([[1.1, 0.9]]) result = pl.apply(data) assert (result == data - 1).all() +def test_correct_baseline_average_exclude(): + pl = PreProcessingPipeline() + pl.correct_baseline_average(exclude={"dim_1": 1}) + data = xr.DataArray([[1.1, 0.9]]) + result = pl.apply(data) + print(result) + assert (result == data - 1.1).all() + + def test_to_from_dict(): pl = PreProcessingPipeline() pl.correct_baseline_value(1) @@ -29,7 +38,7 @@ def test_to_from_dict(): assert pl_dict == { "actions": [ {"action": "baseline-value", "value": 1.0}, - {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}}, + {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}, "exclude": None}, ] } assert PreProcessingPipeline.parse_obj(pl_dict) == pl From 9473bf56e48a303d47d4a6ec402643ac8c2da3a7 Mon Sep 17 00:00:00 2001 From: Joern Weissenborn Date: Wed, 1 Mar 2023 20:29:02 +0100 Subject: [PATCH 3/3] Tweaks. 
--- glotaran/io/preprocessor/pipeline.py | 6 +++--- glotaran/io/preprocessor/preprocessor.py | 5 ++--- glotaran/io/preprocessor/test/test_preprocessor.py | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/glotaran/io/preprocessor/pipeline.py b/glotaran/io/preprocessor/pipeline.py index 9b1a14c6f..c331b8008 100644 --- a/glotaran/io/preprocessor/pipeline.py +++ b/glotaran/io/preprocessor/pipeline.py @@ -66,14 +66,14 @@ def correct_baseline_value(self, value: float) -> PreProcessingPipeline: def correct_baseline_average( self, - selection: dict[str, slice | list[int] | int] | None = None, + select: dict[str, slice | list[int] | int] | None = None, exclude: dict[str, slice | list[int] | int] | None = None, ) -> PreProcessingPipeline: """Correct a dataset by subtracting the average over a part of the data. Parameters ---------- - selection: dict[str, slice | list[int] | int] | None + select: dict[str, slice | list[int] | int] | None The selection to average as dictionary of dimension and indexer. The indexer can be a slice, a list or an integer value. 
exclude: dict[str, slice | list[int] | int] | None @@ -84,5 +84,5 @@ def correct_baseline_average( ------- PreProcessingPipeline """ - self._push_action(CorrectBaselineAverage(exclude=exclude, selection=selection)) + self._push_action(CorrectBaselineAverage(exclude=exclude, select=select)) return self diff --git a/glotaran/io/preprocessor/preprocessor.py b/glotaran/io/preprocessor/preprocessor.py index fa8514791..6d918a96e 100644 --- a/glotaran/io/preprocessor/preprocessor.py +++ b/glotaran/io/preprocessor/preprocessor.py @@ -58,7 +58,7 @@ class CorrectBaselineAverage(PreProcessor): """Corrects a dataset by subtracting the average over a part of the data.""" action: Literal["baseline-average"] = "baseline-average" - selection: dict[str, slice | list[int] | int] | None = None + select: dict[str, slice | list[int] | int] | None = None exclude: dict[str, slice | list[int] | int] | None = None def apply(self, data: xr.DataArray) -> xr.DataArray: @@ -73,5 +73,4 @@ def apply(self, data: xr.DataArray) -> xr.DataArray: ------- xr.DataArray """ - selection = data.sel(self.selection or {}).drop_sel(self.exclude or {}) - return data - (selection.sum() / selection.size) + return data - data.sel(self.select or {}).drop_sel(self.exclude or {}).mean() diff --git a/glotaran/io/preprocessor/test/test_preprocessor.py b/glotaran/io/preprocessor/test/test_preprocessor.py index 03e9de99e..f6a6e283e 100644 --- a/glotaran/io/preprocessor/test/test_preprocessor.py +++ b/glotaran/io/preprocessor/test/test_preprocessor.py @@ -15,7 +15,7 @@ def test_correct_baseline_value(): @pytest.mark.parametrize("indexer", (slice(0, 2), [0, 1])) def test_correct_baseline_average(indexer: slice | list[int]): pl = PreProcessingPipeline() - pl.correct_baseline_average(selection={"dim_0": 0, "dim_1": indexer}) + pl.correct_baseline_average(select={"dim_0": 0, "dim_1": indexer}) data = xr.DataArray([[1.1, 0.9]]) result = pl.apply(data) assert (result == data - 1).all() @@ -23,7 +23,7 @@ def 
test_correct_baseline_average(indexer: slice | list[int]): def test_correct_baseline_average_exclude(): pl = PreProcessingPipeline() - pl.correct_baseline_average(exclude={"dim_1": 1}) + pl.correct_baseline_average(select={"dim_0": 0}, exclude={"dim_1": 1}) data = xr.DataArray([[1.1, 0.9]]) result = pl.apply(data) print(result) @@ -38,7 +38,7 @@ def test_to_from_dict(): assert pl_dict == { "actions": [ {"action": "baseline-value", "value": 1.0}, - {"action": "baseline-average", "selection": {"dim_1": slice(0, 2)}, "exclude": None}, + {"action": "baseline-average", "select": {"dim_1": slice(0, 2)}, "exclude": None}, ] } assert PreProcessingPipeline.parse_obj(pl_dict) == pl