From 0ab1883854f6c59d6abb8fc6eaa41331d30e6ea4 Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:39:03 +0200 Subject: [PATCH 01/12] Implemented RobustScaler; Tests still missing; Functionality not yet tested --- .../data/tabular/transformation/__init__.py | 3 + .../tabular/transformation/_robust_scaler.py | 190 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 src/safeds/data/tabular/transformation/_robust_scaler.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index b7f19d22e..920f0b7e5 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -10,6 +10,7 @@ from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder from ._range_scaler import RangeScaler + from ._robust_scaler import RobustScaler from ._simple_imputer import SimpleImputer from ._standard_scaler import StandardScaler from ._table_transformer import TableTransformer @@ -22,6 +23,7 @@ "LabelEncoder": "._label_encoder:LabelEncoder", "OneHotEncoder": "._one_hot_encoder:OneHotEncoder", "RangeScaler": "._range_scaler:RangeScaler", + "RobustScaler": "._robust_scaler:RobustScaler", "SimpleImputer": "._simple_imputer:SimpleImputer", "StandardScaler": "._standard_scaler:StandardScaler", "TableTransformer": "._table_transformer:TableTransformer", @@ -34,6 +36,7 @@ "LabelEncoder", "OneHotEncoder", "RangeScaler", + "RobustScaler", "SimpleImputer", "StandardScaler", "TableTransformer", diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py new file mode 100644 index 000000000..854216183 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds._validation import _check_columns_exist +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric +from safeds.data.tabular.containers import Table +from safeds.exceptions import TransformerNotFittedError + +from ._invertible_table_transformer import InvertibleTableTransformer + +if TYPE_CHECKING: + import polars as pl + +class RobustScaler(InvertibleTableTransformer): + # Does it actually transform values to a range? + """ + The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. + + Parameters + ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all numeric columns are used. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, column_names: str | list[str] | None = None) -> None: + super().__init__(column_names) + + # Internal state + self._data_median: pl.DataFrame | None = None + self._data_scale: pl.DataFrame | None = None + + def __hash__(self) -> int: + # Leave out the internal state for faster hashing + return super().__hash__() + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._data_median is not None and self._data_scale is not None + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ + + def fit(self, table: Table) -> RobustScaler: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + ColumnNotFoundError + If column_names contain a column name that is missing in the table. + ColumnTypeError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + if self._column_names is None: + column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] + else: + column_names = self._column_names + _check_columns_exist(table, column_names) + _check_columns_are_numeric(table, column_names, operation="fit a RobustScaler") + + if table.row_count == 0: + raise ValueError("The RobustScaler cannot be fitted because the table contains 0 rows") + + # Learn the transformation (ddof=0 is used to match the behavior of scikit-learn) + _data_median = table._lazy_frame.select(column_names).median().collect() + q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() + q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() + _data_scale = q3 - q1 # TODO: Check if this works + + # Create a copy with the learned transformation + result = RobustScaler(column_names=column_names) + result._data_median = _data_median + result._data_scale = _data_scale + + return result + + def transform(self, table: Table) -> Table: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + ColumnNotFoundError + If the input table does not contain all columns used to fit the transformer. + ColumnTypeError + If at least one of the columns in the input table that is used to fit contains non-numerical data. + """ + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_median is None or self._data_scale is None: + raise TransformerNotFittedError + + _check_columns_exist(table, self._column_names) + _check_columns_are_numeric(table, self._column_names, operation="transform with a RobustScaler") + + columns = [ + (pl.col(name) - self._data_median.get_column(name)) / self._data_scale.get_column(name) + for name in self._column_names + ] + + return Table._from_polars_lazy_frame( + table._lazy_frame.with_columns(columns), + ) + + def inverse_transform(self, transformed_table: Table) -> Table: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + original_table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + ColumnNotFoundError + If the input table does not contain all columns used to fit the transformer. + ColumnTypeError + If the transformed columns of the input table contain non-numerical data. + """ + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_median is None or self._data_scale is None: + raise TransformerNotFittedError + + _check_columns_exist(transformed_table, self._column_names) + _check_columns_are_numeric( + transformed_table, + self._column_names, + operation="inverse-transform with a RobustScaler", + ) + + columns = [ + pl.col(name) * self._data_scale.get_column(name) + self._data_median.get_column(name) + for name in self._column_names + ] + + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.with_columns(columns), + ) \ No newline at end of file From 9e033dca99d0786e14233c5f069fd28894efa79b Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:07:58 +0200 Subject: [PATCH 02/12] feat:implemented tests --- .../transformation/test_robust_scaler.py | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 tests/safeds/data/tabular/transformation/test_robust_scaler.py diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py new file mode 100644 index 000000000..ed3a5b495 --- /dev/null +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -0,0 +1,213 @@ +import pytest +from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation import RobustScaler +from safeds.exceptions import ColumnNotFoundError, ColumnTypeError, TransformerNotFittedError + +from tests.helpers import assert_tables_equal + + +class TestFit: + def test_should_raise_if_column_not_found(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + with pytest.raises(ColumnNotFoundError): + RobustScaler(column_names=["col2", "col3"]).fit(table) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), + ) + + def test_should_raise_if_table_contains_no_rows(self) -> None: + with pytest.raises(ValueError, match=r"The RobustScaler cannot be fitted because the table contains 0 rows"): + RobustScaler().fit(Table({"col1": []})) + + def test_should_not_change_original_transformer(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + transformer.fit(table) + + assert transformer._column_names is None + assert transformer._data_median is None + assert transformer._data_scale is None + + +class TestTransform: + def test_should_raise_if_column_not_found(self) -> None: + table_to_fit = Table( + { + "col1": [0.0, 5.0, 10.0], + "col2": [5.0, 50.0, 100.0], + }, + ) + + transformer = RobustScaler().fit(table_to_fit) + + table_to_transform = Table( + { + "col3": ["a", "b", "c"], + }, + ) + + with pytest.raises(ColumnNotFoundError): + transformer.transform(table_to_transform) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + + with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): + transformer.transform(table) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( + Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), + ) + + +class TestIsFitted: + def test_should_return_false_before_fitting(self) -> None: + transformer = RobustScaler() + assert not transformer.is_fitted + + def test_should_return_true_after_fitting(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + transformer = RobustScaler() + fitted_transformer = transformer.fit(table) + assert fitted_transformer.is_fitted + +# TODO: Replace values +class TestFitAndTransform: + @pytest.mark.parametrize( + ("table", "column_names", "expected"), + [ + ( + Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": [1.0, 2.0, 3.0, 4.0], + }, + ), + None, + Table( + { + "col1": [-1.5, -0.5, 0.5, 1.5], + "col2": [-1.5, -0.5, 0.5, 1.5], + }, + ), + ), + ], + ids=["two_columns"], + ) + def test_should_return_fitted_transformer_and_transformed_table( + self, + table: Table, + column_names: list[str] | None, + expected: Table, + ) -> None: + fitted_transformer, transformed_table = RobustScaler(column_names=column_names).fit_and_transform(table) + assert fitted_transformer.is_fitted + assert_tables_equal(transformed_table, expected) + + def test_should_not_change_original_table(self) -> None: + table = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + RobustScaler().fit_and_transform(table) + + expected = Table( + { + "col1": [0.0, 5.0, 10.0], + }, + ) + + assert table == expected + + +class TestInverseTransform: + @pytest.mark.parametrize( + "table", + [ + Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + }, + ), + ], + ids=["one_column"], + ) + def test_should_return_original_table(self, table: Table) -> None: + transformer = RobustScaler().fit(table) + + assert transformer.inverse_transform(transformer.transform(table)) == table + + def test_should_not_change_transformed_table(self) -> None: + table = Table( + { + "col1": [0.0, 0.5, 1.0, 1.5, 2.0], + }, + ) + + transformer = RobustScaler().fit(table) + transformed_table = transformer.transform(table) + transformed_table = transformer.inverse_transform(transformed_table) + + expected = Table( + { + "col1": [0.0, 0.5, 1.0, 1.5, 2.0], + }, + ) + + assert_tables_equal(transformed_table, expected) + + def test_should_raise_if_not_fitted(self) -> None: + table = Table( + { + "col1": [1.0, 2.0, 3.0, 4.0], + }, + ) + + transformer = RobustScaler() + + with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): + transformer.inverse_transform(table) + + def test_should_raise_if_column_not_found(self) -> None: + with pytest.raises(ColumnNotFoundError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), + ).inverse_transform( + Table({"col3": [0, 1, 2, 3]}), + ) + + def test_should_raise_if_table_contains_non_numerical_data(self) -> None: + with pytest.raises(ColumnTypeError): + RobustScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3, 4], "col2": [2, 3, 4, 5]}), + ).inverse_transform( + Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), + ) From 7ece2622bef2b72f7e15b40f06765618e3584b5b Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:31:18 +0200 Subject: [PATCH 03/12] feat: test for division by zero while fitting --- .../data/tabular/transformation/_robust_scaler.py | 3 +-- .../tabular/transformation/test_robust_scaler.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 854216183..fe48f7b13 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -13,7 +13,6 @@ import polars as pl class RobustScaler(InvertibleTableTransformer): - # Does it actually transform values to a range? """ The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. @@ -90,7 +89,7 @@ def fit(self, table: Table) -> RobustScaler: _data_median = table._lazy_frame.select(column_names).median().collect() q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() - _data_scale = q3 - q1 # TODO: Check if this works + _data_scale = q3 - q1 # Create a copy with the learned transformation result = RobustScaler(column_names=column_names) diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index ed3a5b495..a8b8cd068 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -41,6 +41,17 @@ def test_should_not_change_original_transformer(self) -> None: assert transformer._data_median is None assert transformer._data_scale is None + # TODO: fix this + def test_should_not_divide_by_zero(self) -> None: + table = Table( + { + "col1": [1.0, 1.0, 2.0, 1.0], + }, + ) + + transformer = RobustScaler() + transformer.fit(table) + class TestTransform: def test_should_raise_if_column_not_found(self) -> None: @@ -97,7 +108,6 @@ def test_should_return_true_after_fitting(self) -> None: fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted -# TODO: Replace values class TestFitAndTransform: @pytest.mark.parametrize( ("table", "column_names", "expected"), From a57732cc6ef81ce799dbb0569995abe9213b1f69 Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:26:59 +0200 Subject: [PATCH 04/12] Test not yet functional --- .../data/tabular/transformation/_robust_scaler.py | 13 +++++++++++++ .../tabular/transformation/test_robust_scaler.py | 4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index fe48f7b13..47ac64f1b 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -86,11 +86,24 @@ def fit(self, table: Table) -> RobustScaler: raise ValueError("The RobustScaler cannot be fitted because the table contains 0 rows") # Learn the transformation (ddof=0 is used to match the behavior of scikit-learn) + # n-tiles = [0.25, 0.125, 0.0675] _data_median = table._lazy_frame.select(column_names).median().collect() q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() _data_scale = q3 - q1 + for col in column_names: + if _data_scale.select(col).value(0) == 0: + q1 = table._lazy_frame.select(column_names).quantile(0.125).collect() + q3 = table._lazy_frame.select(column_names).quantile(0.875).collect() + _data_scale.get_column(col).value(0) = q3 - q1 + if _data_scale.select(col).first() == 0: + q1 = table._lazy_frame.select(column_names).quantile(0.0675).collect() + q3 = table._lazy_frame.select(column_names).quantile(0.9325).collect() + _data_scale.get_column(col).first = q3 - q1 + if _data_scale.select(col).first() == 0: + _data_scale.get_column(col).first = 10 ** 5 + # Create a copy with the learned transformation result = RobustScaler(column_names=column_names) result._data_median = _data_median diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index a8b8cd068..9727a5c39 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -46,11 +46,13 @@ def test_should_not_divide_by_zero(self) -> None: table = Table( { "col1": [1.0, 1.0, 2.0, 1.0], + "col2": [3.0, 3.0, 3.0, 3.0], }, ) transformer = RobustScaler() - transformer.fit(table) + f_transformer = transformer.fit(table) + table = f_transformer.transform(table) class TestTransform: From 3326fcab6fba8264121b06268c29d6351544568b Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:27:27 +0200 Subject: [PATCH 05/12] Test not working yet --- src/safeds/data/tabular/transformation/_robust_scaler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 47ac64f1b..e2b1008e6 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -92,8 +92,11 @@ def fit(self, table: Table) -> RobustScaler: q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() _data_scale = q3 - q1 + # + for col in column_names: - if _data_scale.select(col).value(0) == 0: + #if 0 in _data_scale.select(col).to_list() + if _data_scale.select(col).to_list() == [0]: q1 = table._lazy_frame.select(column_names).quantile(0.125).collect() q3 = table._lazy_frame.select(column_names).quantile(0.875).collect() _data_scale.get_column(col).value(0) = q3 - q1 From 0c1531e301d232e6735e7bbcb3efa59a71cff052 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 28 Jun 2024 11:48:59 +0200 Subject: [PATCH 06/12] hopefully works now --- .../tabular/transformation/_robust_scaler.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index e2b1008e6..3081f805c 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -91,21 +91,9 @@ def fit(self, table: Table) -> RobustScaler: q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() _data_scale = q3 - q1 - - # - + # To make sure there is no division by zero for col in column_names: - #if 0 in _data_scale.select(col).to_list() - if _data_scale.select(col).to_list() == [0]: - q1 = table._lazy_frame.select(column_names).quantile(0.125).collect() - q3 = table._lazy_frame.select(column_names).quantile(0.875).collect() - _data_scale.get_column(col).value(0) = q3 - q1 - if _data_scale.select(col).first() == 0: - q1 = table._lazy_frame.select(column_names).quantile(0.0675).collect() - q3 = table._lazy_frame.select(column_names).quantile(0.9325).collect() - _data_scale.get_column(col).first = q3 - q1 - if _data_scale.select(col).first() == 0: - _data_scale.get_column(col).first = 10 ** 5 + _data_scale.with_columns(col = pl.when(pl.int_range(1) == 0).then(1).otherwise(pl.col(col))) # Create a copy with the learned transformation result = RobustScaler(column_names=column_names) From 732418e5a1f0a73fa7b92d74b4f4d7e0906d2ac2 Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:18:32 +0200 Subject: [PATCH 07/12] RobustScaler now fully functional; test are wip --- src/safeds/data/tabular/transformation/_robust_scaler.py | 6 ++++-- .../data/tabular/transformation/test_robust_scaler.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 3081f805c..2f14515ee 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -75,6 +75,7 @@ def fit(self, table: Table) -> RobustScaler: ValueError If the table contains 0 rows. """ + import polars as pl if self._column_names is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: @@ -91,9 +92,10 @@ def fit(self, table: Table) -> RobustScaler: q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() _data_scale = q3 - q1 + # To make sure there is no division by zero - for col in column_names: - _data_scale.with_columns(col = pl.when(pl.int_range(1) == 0).then(1).otherwise(pl.col(col))) + for col_e in column_names: + _data_scale = _data_scale.with_columns(pl.when(pl.col(col_e) == 0).then(1).otherwise(pl.col(col_e)).alias(col_e)) # Create a copy with the learned transformation result = RobustScaler(column_names=column_names) diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index 9727a5c39..2746e3773 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -55,6 +55,7 @@ def test_should_not_divide_by_zero(self) -> None: table = f_transformer.transform(table) + class TestTransform: def test_should_raise_if_column_not_found(self) -> None: table_to_fit = Table( From feaa669b675adbd9265527f8394102a78ede0891 Mon Sep 17 00:00:00 2001 From: srose <118634249+wastedareas@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:22:49 +0200 Subject: [PATCH 08/12] Ready for pull request --- .../data/tabular/transformation/test_robust_scaler.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index 2746e3773..069e14544 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -49,10 +49,16 @@ def test_should_not_divide_by_zero(self) -> None: "col2": [3.0, 3.0, 3.0, 3.0], }, ) - + target = Table( + { + "col1": [0.0, 0.0, 1.0, 0.0], + "col2": [0.0, 0.0, 0.0, 0.0], + } + ) transformer = RobustScaler() f_transformer = transformer.fit(table) table = f_transformer.transform(table) + assert(table == target) From 852686542eed61bdfbde1543fc2508ffb2b35273 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 28 Jun 2024 12:46:04 +0200 Subject: [PATCH 09/12] added test coverage for NaN in column, those cases need to be handled --- src/safeds/data/tabular/transformation/_robust_scaler.py | 2 +- .../safeds/data/tabular/transformation/test_robust_scaler.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 2f14515ee..7ef38e602 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -14,7 +14,7 @@ class RobustScaler(InvertibleTableTransformer): """ - The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. + The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range, though for columns with high stability it might only substract the median. Parameters ---------- diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index 069e14544..d15a0197c 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -47,13 +47,15 @@ def test_should_not_divide_by_zero(self) -> None: { "col1": [1.0, 1.0, 2.0, 1.0], "col2": [3.0, 3.0, 3.0, 3.0], + "col3": [1.0, float("nan"), float("nan"), float("nan")], }, ) target = Table( { "col1": [0.0, 0.0, 1.0, 0.0], "col2": [0.0, 0.0, 0.0, 0.0], - } + "col3": [0.0, float("nan"), float("nan"), float("nan")], + }, ) transformer = RobustScaler() f_transformer = transformer.fit(table) From fa25bfe60e48a0ae9b25b5054b75dc767264d104 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 28 Jun 2024 14:27:47 +0200 Subject: [PATCH 10/12] Works as long as no NaN is in the columns --- src/safeds/data/tabular/transformation/_robust_scaler.py | 5 +++-- .../data/tabular/transformation/test_robust_scaler.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 7ef38e602..1fc8a272c 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -14,7 +14,9 @@ class RobustScaler(InvertibleTableTransformer): """ - The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range, though for columns with high stability it might only substract the median. + The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. + + Currently for columns with high stability (IQR == 0) it will only substract the median and not scale to avoid dividing by zero. Parameters ---------- @@ -86,7 +88,6 @@ def fit(self, table: Table) -> RobustScaler: if table.row_count == 0: raise ValueError("The RobustScaler cannot be fitted because the table contains 0 rows") - # Learn the transformation (ddof=0 is used to match the behavior of scikit-learn) # n-tiles = [0.25, 0.125, 0.0675] _data_median = table._lazy_frame.select(column_names).median().collect() q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index d15a0197c..aa2c9d2c6 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -41,20 +41,22 @@ def test_should_not_change_original_transformer(self) -> None: assert transformer._data_median is None assert transformer._data_scale is None - # TODO: fix this + # TODO: Tests for None and NaN values should be moved to their own function def test_should_not_divide_by_zero(self) -> None: table = Table( { "col1": [1.0, 1.0, 2.0, 1.0], "col2": [3.0, 3.0, 3.0, 3.0], - "col3": [1.0, float("nan"), float("nan"), float("nan")], + #"col3": [1.0, float("nan"), float("nan"), float("nan")], + "col4": [1.0, None, None, None], }, ) target = Table( { "col1": [0.0, 0.0, 1.0, 0.0], "col2": [0.0, 0.0, 0.0, 0.0], - "col3": [0.0, float("nan"), float("nan"), float("nan")], + #"col3": [0.0, float("nan"), float("nan"), float("nan")], + "col4": [0.0, None, None, None], }, ) transformer = RobustScaler() From 463897fc02debe8e1d2fd111b6130e364d3ff945 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:30:12 +0000 Subject: [PATCH 11/12] style: apply automated linter fixes --- src/safeds/data/tabular/transformation/_robust_scaler.py | 8 ++++++-- .../data/tabular/transformation/test_robust_scaler.py | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 1fc8a272c..70b71e1a3 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: import polars as pl + class RobustScaler(InvertibleTableTransformer): """ The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. @@ -78,6 +79,7 @@ def fit(self, table: Table) -> RobustScaler: If the table contains 0 rows. """ import polars as pl + if self._column_names is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: @@ -96,7 +98,9 @@ def fit(self, table: Table) -> RobustScaler: # To make sure there is no division by zero for col_e in column_names: - _data_scale = _data_scale.with_columns(pl.when(pl.col(col_e) == 0).then(1).otherwise(pl.col(col_e)).alias(col_e)) + _data_scale = _data_scale.with_columns( + pl.when(pl.col(col_e) == 0).then(1).otherwise(pl.col(col_e)).alias(col_e), + ) # Create a copy with the learned transformation result = RobustScaler(column_names=column_names) @@ -193,4 +197,4 @@ def inverse_transform(self, transformed_table: Table) -> Table: return Table._from_polars_lazy_frame( transformed_table._lazy_frame.with_columns(columns), - ) \ No newline at end of file + ) diff --git a/tests/safeds/data/tabular/transformation/test_robust_scaler.py b/tests/safeds/data/tabular/transformation/test_robust_scaler.py index aa2c9d2c6..6c18b4878 100644 --- a/tests/safeds/data/tabular/transformation/test_robust_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_robust_scaler.py @@ -47,7 +47,7 @@ def test_should_not_divide_by_zero(self) -> None: { "col1": [1.0, 1.0, 2.0, 1.0], "col2": [3.0, 3.0, 3.0, 3.0], - #"col3": [1.0, float("nan"), float("nan"), float("nan")], + # "col3": [1.0, float("nan"), float("nan"), float("nan")], "col4": [1.0, None, None, None], }, ) @@ -55,15 +55,14 @@ def test_should_not_divide_by_zero(self) -> None: { "col1": [0.0, 0.0, 1.0, 0.0], "col2": [0.0, 0.0, 0.0, 0.0], - #"col3": [0.0, float("nan"), float("nan"), float("nan")], + # "col3": [0.0, float("nan"), float("nan"), float("nan")], "col4": [0.0, None, None, None], }, ) transformer = RobustScaler() f_transformer = transformer.fit(table) table = f_transformer.transform(table) - assert(table == target) - + assert table == target class TestTransform: @@ -121,6 +120,7 @@ def test_should_return_true_after_fitting(self) -> None: fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted + class TestFitAndTransform: @pytest.mark.parametrize( ("table", "column_names", "expected"), From 9ae94e8ec7215201d4958ea86ba61e392324572a Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 1 Jul 2024 11:42:36 +0200 Subject: [PATCH 12/12] resolved pending review conversations --- src/safeds/data/tabular/transformation/_robust_scaler.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/safeds/data/tabular/transformation/_robust_scaler.py b/src/safeds/data/tabular/transformation/_robust_scaler.py index 70b71e1a3..970b32d6c 100644 --- a/src/safeds/data/tabular/transformation/_robust_scaler.py +++ b/src/safeds/data/tabular/transformation/_robust_scaler.py @@ -17,7 +17,7 @@ class RobustScaler(InvertibleTableTransformer): """ The RobustScaler transforms column values to a range by removing the median and scaling to the interquartile range. - Currently for columns with high stability (IQR == 0) it will only substract the median and not scale to avoid dividing by zero. + Currently, for columns with high stability (IQR == 0), it will only substract the median and not scale to avoid dividing by zero. Parameters ---------- @@ -57,7 +57,7 @@ def fit(self, table: Table) -> RobustScaler: """ Learn a transformation for a set of columns in a table. - This transformer is not modified. + **Note:** This transformer is not modified. Parameters ---------- @@ -90,7 +90,6 @@ def fit(self, table: Table) -> RobustScaler: if table.row_count == 0: raise ValueError("The RobustScaler cannot be fitted because the table contains 0 rows") - # n-tiles = [0.25, 0.125, 0.0675] _data_median = table._lazy_frame.select(column_names).median().collect() q1 = table._lazy_frame.select(column_names).quantile(0.25).collect() q3 = table._lazy_frame.select(column_names).quantile(0.75).collect() @@ -113,7 +112,7 @@ def transform(self, table: Table) -> Table: """ Apply the learned transformation to a table. - The table is not modified. + **Note:** The given table is not modified. Parameters ---------- @@ -156,7 +155,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: """ Undo the learned transformation. - The table is not modified. + **Note:** The given table is not modified. Parameters ----------