From 6fbe5379ae7676906bf8d0f21fe00926d792c60f Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 14 May 2024 22:17:26 +0200 Subject: [PATCH] feat: specify partial order in label encoder (#763) Closes #639 ### Summary of Changes * Optionally specify a partial order of labels in the label encoder * Performance: Implement RangeScaler, StandardScaler, LabelEncoder with polars --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- pyproject.toml | 2 +- .../_validation/_check_columns_are_numeric.py | 61 +++++ src/safeds/data/image/containers/_image.py | 6 +- .../data/image/containers/_image_list.py | 2 - .../containers/_multi_size_image_list.py | 4 +- .../containers/_single_size_image_list.py | 9 +- .../data/labeled/containers/_image_dataset.py | 12 +- src/safeds/data/tabular/containers/_table.py | 12 +- .../tabular/transformation/_discretizer.py | 87 ++---- .../tabular/transformation/_label_encoder.py | 211 ++++++--------- .../transformation/_one_hot_encoder.py | 118 +++----- .../tabular/transformation/_range_scaler.py | 253 +++++++----------- .../tabular/transformation/_simple_imputer.py | 105 +++----- .../transformation/_standard_scaler.py | 201 ++++---------- .../transformation/_table_transformer.py | 83 ++---- src/safeds/exceptions/__init__.py | 5 + .../ml/classical/_bases/_ada_boost_base.py | 1 - .../classical/_bases/_decision_tree_base.py | 1 - .../_bases/_gradient_boosting_base.py | 1 - .../_bases/_k_nearest_neighbors_base.py | 1 - .../classical/_bases/_random_forest_base.py | 1 - .../_bases/_support_vector_machine_base.py | 4 - src/safeds/ml/classical/_supervised_model.py | 2 +- .../_gradient_boosting_classifier.py | 3 - .../_support_vector_classifier.py | 8 - src/safeds/ml/nn/_model.py | 1 - .../nn/converters/_output_converter_image.py | 6 +- .../ml/nn/layers/_convolutional2d_layer.py | 1 - src/safeds/ml/nn/layers/_pooling2d_layer.py | 2 - tests/helpers/__init__.py | 4 +- tests/helpers/_assertions.py | 14 +- .../data/image/containers/test_image.py | 11 +- .../data/image/containers/test_image_list.py | 102 +++---- .../data/image/typing/test_image_size.py | 9 +- .../_tabular_dataset/test_into_dataloader.py | 5 +- .../labeled/containers/test_image_dataset.py | 101 +++++-- .../tabular/containers/_column/test_repr.py | 6 +- .../tabular/containers/_column/test_str.py | 6 +- .../tabular/containers/_table/test_hash.py | 10 +- .../_table/test_number_of_columns.py | 3 +- .../containers/_table/test_number_of_rows.py | 3 +- .../containers/_table/test_plot_histograms.py | 2 +- .../_table/test_remove_duplicate_rows.py | 3 +- .../_table/test_remove_rows_with_outliers.py | 2 - .../containers/_table/test_transform_table.py | 2 +- .../transformation/test_discretizer.py | 40 +-- .../transformation/test_label_encoder.py | 61 +---- .../transformation/test_one_hot_encoder.py | 42 +-- .../transformation/test_range_scaler.py | 70 +---- .../transformation/test_simple_imputer.py | 102 +++---- .../transformation/test_standard_scaler.py | 80 +----- .../transformation/test_table_transformer.py | 4 +- .../converters/test_input_converter_image.py | 4 - .../converters/test_output_converter_image.py | 6 - .../test_output_converter_time_series.py | 3 - .../nn/layers/test_convolutional2d_layer.py | 4 - .../safeds/ml/nn/layers/test_flatten_layer.py | 4 - tests/safeds/ml/nn/layers/test_lstm_layer.py | 1 - .../ml/nn/layers/test_pooling2d_layer.py | 4 - tests/safeds/ml/nn/test_cnn_workflow.py | 3 - 60 files changed, 672 insertions(+), 1242 deletions(-) create mode 100644 src/safeds/_validation/_check_columns_are_numeric.py diff --git a/pyproject.toml b/pyproject.toml index a5314c608..beb354ae3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ omit = [ ] [tool.pytest.ini_options] -addopts = "--snapshot-warn-unused" +addopts = "--snapshot-warn-unused --tb=short" filterwarnings = [ "ignore:Deprecated call to `pkg_resources.declare_namespace", "ignore:Jupyter is migrating its paths to use standard platformdirs" diff --git a/src/safeds/_validation/_check_columns_are_numeric.py b/src/safeds/_validation/_check_columns_are_numeric.py new file mode 100644 index 000000000..0dae84220 --- /dev/null +++ b/src/safeds/_validation/_check_columns_are_numeric.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.exceptions import ColumnTypeError + +if TYPE_CHECKING: + from collections.abc import Container + + from safeds.data.tabular.containers import Table + from safeds.data.tabular.typing import Schema + + +def _check_columns_are_numeric( + table_or_schema: Table | Schema, + column_names: str | list[str], + *, + operation: str = "do a numeric operation", +) -> None: + """ + Check if the columns with the specified names are numeric and raise an error if they are not. + + Missing columns are ignored. Use `_check_columns_exist` to check for missing columns. + + Parameters + ---------- + table_or_schema: + The table or schema to check. + column_names: + The column names to check. + operation: + The operation that is performed on the columns. This is used in the error message. + + Raises + ------ + ColumnTypeError + If a column exists but is not numeric. + """ + from safeds.data.tabular.containers import Table # circular import + + if isinstance(table_or_schema, Table): + table_or_schema = table_or_schema.schema + if isinstance(column_names, str): + column_names = [column_names] + + if len(column_names) > 1: + # Create a set for faster containment checks + known_names: Container = set(table_or_schema.column_names) + else: + known_names = table_or_schema.column_names + + non_numeric_names = [ + name for name in column_names if name in known_names and not table_or_schema.get_column_type(name).is_numeric + ] + if non_numeric_names: + message = _build_error_message(non_numeric_names, operation) + raise ColumnTypeError(message) + + +def _build_error_message(non_numeric_names: list[str], operation: str) -> str: + return f"Tried to {operation} on non-numeric columns {non_numeric_names}." diff --git a/src/safeds/data/image/containers/_image.py b/src/safeds/data/image/containers/_image.py index fb104ce8d..c8a316812 100644 --- a/src/safeds/data/image/containers/_image.py +++ b/src/safeds/data/image/containers/_image.py @@ -1,7 +1,6 @@ from __future__ import annotations import io -import os.path import sys import warnings from pathlib import Path @@ -79,9 +78,12 @@ def from_file(path: str | Path) -> Image: """ from torchvision.io import read_image + if isinstance(path, str): + path = Path(path) + _init_default_device() - if not os.path.isfile(path): + if not path.is_file(): raise FileNotFoundError(f"No such file or directory: '{path}'") return Image(image_tensor=read_image(str(path)).to(_get_device())) diff --git a/src/safeds/data/image/containers/_image_list.py b/src/safeds/data/image/containers/_image_list.py index 635958613..b35d00e99 100644 --- a/src/safeds/data/image/containers/_image_list.py +++ b/src/safeds/data/image/containers/_image_list.py @@ -283,7 +283,6 @@ def from_files( return image_list class _FromFileThreadPackage: - def __init__( self, im_files: list[str], @@ -323,7 +322,6 @@ def __len__(self) -> int: return len(self._im_files) class _FromImageThread(Thread): - def __init__(self, packages: list[ImageList._FromFileThreadPackage]) -> None: super().__init__() self._packages = packages diff --git a/src/safeds/data/image/containers/_multi_size_image_list.py b/src/safeds/data/image/containers/_multi_size_image_list.py index be1ca1789..5fcaceaca 100644 --- a/src/safeds/data/image/containers/_multi_size_image_list.py +++ b/src/safeds/data/image/containers/_multi_size_image_list.py @@ -66,7 +66,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS single_size_image_list._indices_to_tensor_positions.keys(), [image_size] * len(single_size_image_list), strict=False, - ) + ), ) if max_channel is None: max_channel = single_size_image_list.channel @@ -80,7 +80,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS for size in image_list._image_list_dict: if max_channel is not None and image_list._image_list_dict[size].channel != max_channel: image_list._image_list_dict[size] = image_list._image_list_dict[size].change_channel( - int(max_channel) + int(max_channel), ) return image_list diff --git a/src/safeds/data/image/containers/_single_size_image_list.py b/src/safeds/data/image/containers/_single_size_image_list.py index fea3c3529..035e5b14a 100644 --- a/src/safeds/data/image/containers/_single_size_image_list.py +++ b/src/safeds/data/image/containers/_single_size_image_list.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from safeds._config import _init_default_device, _get_device +from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash from safeds.data.image._utils._image_transformation_error_and_warning_checks import ( _check_add_noise_errors, @@ -82,7 +82,12 @@ def _create_image_list_from_files( image_list = _SingleSizeImageList() images_tensor = torch.empty( - number_of_images, max_channel, height, width, dtype=torch.uint8, device=_get_device() + number_of_images, + max_channel, + height, + width, + dtype=torch.uint8, + device=_get_device(), ) thread_packages: list[ImageList._FromFileThreadPackage] = [] diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py index 41af7f9f6..32430554a 100644 --- a/src/safeds/data/labeled/containers/_image_dataset.py +++ b/src/safeds/data/labeled/containers/_image_dataset.py @@ -89,7 +89,7 @@ def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, s _output_size: int | ImageSize = output_data.number_of_columns elif isinstance(output_data, Column): _column_as_tensor = _ColumnAsTensor(output_data) - _output_size = len(_column_as_tensor._one_hot_encoder.get_names_of_added_columns()) + _output_size = len(_column_as_tensor._one_hot_encoder._get_names_of_added_columns()) _output = _column_as_tensor elif isinstance(output_data, _SingleSizeImageList): _output = output_data._clone()._as_single_size_image_list() @@ -289,7 +289,6 @@ def shuffle(self) -> ImageDataset[T]: class _TableAsTensor: - def __init__(self, table: Table) -> None: import torch @@ -345,7 +344,6 @@ def _to_table(self) -> Table: class _ColumnAsTensor: - def __init__(self, column: Column) -> None: import torch @@ -359,6 +357,8 @@ def __init__(self, column: Column) -> None: message=rf"The columns \['{self._column_name}'\] contain numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values", category=UserWarning, ) + # TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not + # be done automatically? self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name]) self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch()).to( _get_device(), @@ -394,9 +394,9 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode raise ValueError(f"Tensor has an invalid amount of dimensions. Needed 2 dimensions but got {tensor.dim()}.") if not one_hot_encoder.is_fitted: raise TransformerNotFittedError - if tensor.size(dim=1) != len(one_hot_encoder.get_names_of_added_columns()): + if tensor.size(dim=1) != len(one_hot_encoder._get_names_of_added_columns()): raise ValueError( - f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder.get_names_of_added_columns())}).", + f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder._get_names_of_added_columns())}).", ) table_as_tensor = _ColumnAsTensor.__new__(_ColumnAsTensor) table_as_tensor._tensor = tensor @@ -406,6 +406,6 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode def _to_column(self) -> Column: table = Table( - dict(zip(self._one_hot_encoder.get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)), + dict(zip(self._one_hot_encoder._get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)), ) return self._one_hot_encoder.inverse_transform(table).get_column(self._column_name) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index d05c433cf..d5c590c0f 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -323,7 +323,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: # Implementation self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) - self.__data_frame_cache: pl.DataFrame | None = None + self.__data_frame_cache: pl.DataFrame | None = None # Scramble the name to prevent access from outside def __eq__(self, other: object) -> bool: if not isinstance(other, Table): @@ -1033,6 +1033,9 @@ def remove_duplicate_rows(self) -> Table: | 2 | 5 | +-----+-----+ """ + if self.number_of_columns == 0: + return self # Workaround for https://github.com/pola-rs/polars/issues/16207 + return Table._from_polars_lazy_frame( self._lazy_frame.unique(maintain_order=True), ) @@ -1221,6 +1224,8 @@ def remove_rows_with_outliers( | null | 8 | +------+-----+ """ + if self.number_of_rows == 0: + return self # polars raises a ComputeError for tables without rows if column_names is None: column_names = self.column_names @@ -1440,7 +1445,10 @@ def split_rows( The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table contains the remaining rows. - **Note:** The original table is not modified. + **Notes:** + + - The original table is not modified. + - By default, the rows are shuffled before splitting. You can disable this by setting `shuffle` to False. Parameters ---------- diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py index 614a8cbff..f55809dd6 100644 --- a/src/safeds/data/tabular/transformation/_discretizer.py +++ b/src/safeds/data/tabular/transformation/_discretizer.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound from safeds.data.tabular.containers import Table from safeds.exceptions import ( @@ -30,13 +31,36 @@ class Discretizer(TableTransformer): If the given number_of_bins is less than 2. """ - def __init__(self, number_of_bins: int = 5): + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, number_of_bins: int = 5) -> None: + TableTransformer.__init__(self) + _check_bounds("number_of_bins", number_of_bins, lower_bound=_ClosedBound(2)) - self._column_names: list[str] | None = None self._wrapped_transformer: sk_KBinsDiscretizer | None = None self._number_of_bins = number_of_bins + def __hash__(self) -> int: + return _structural_hash( + TableTransformer.__hash__(self), + self._number_of_bins, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def number_of_bins(self) -> int: + return self._number_of_bins + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ + def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: """ Learn a transformation for a set of columns in a table. @@ -137,62 +161,3 @@ def transform(self, table: Table) -> Table: return Table._from_polars_lazy_frame( table._lazy_frame.update(new_data.lazy()), ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the Discretizer. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the Discretizer. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the Discretizer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index 11f788dbb..532dfd1d5 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -1,24 +1,54 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import Any +from safeds._utils import _structural_hash from safeds._validation import _check_columns_exist +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.tabular.containers import Table -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError +from safeds.exceptions import TransformerNotFittedError from ._invertible_table_transformer import InvertibleTableTransformer -if TYPE_CHECKING: - from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder - class LabelEncoder(InvertibleTableTransformer): - """The LabelEncoder encodes one or more given columns into labels.""" + """ + The LabelEncoder encodes one or more given columns into labels. + + Parameters + ---------- + partial_order: + The partial order of the labels. The labels are encoded in the order of the given list. Additional values are + encoded as the next integer after the last value in the list in the order they appear in the data. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, *, partial_order: list[Any] | None = None) -> None: + super().__init__() + + if partial_order is None: + partial_order = [] + + # Parameters + self._partial_order = partial_order + + # Internal state + self._mapping: dict[str, dict[Any, int]] | None = None + self._inverse_mapping: dict[str, dict[int, Any]] | None = None + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._partial_order, + ) - def __init__(self) -> None: - self._wrapped_transformer: sk_OrdinalEncoder | None = None - self._column_names: list[str] | None = None + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: """ @@ -45,8 +75,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: ValueError If the table contains 0 rows. """ - from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder - if column_names is None: column_names = table.column_names else: @@ -55,27 +83,29 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: - warnings.warn( - "The columns" - f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" - " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values", - UserWarning, - stacklevel=2, - ) - - # TODO: use polars Enum type instead: - # my_enum = pl.Enum(['A', 'B', 'C']) <-- create this from the given order - # my_data = pl.Series(['A', 'A', 'B'], dtype=my_enum) - wrapped_transformer = sk_OrdinalEncoder() - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) + _warn_if_columns_are_numeric(table, column_names) + + # Learn the transformation + mapping = {} + reverse_mapping = {} - result = LabelEncoder() - result._wrapped_transformer = wrapped_transformer + for name in column_names: + # Remember partial order + mapping[name] = {value: index for index, value in enumerate(self._partial_order)} + reverse_mapping[name] = {index: value for value, index in mapping[name].items()} + + unique_values = table.get_column(name).get_distinct_values() + for value in unique_values: + if value not in mapping[name]: + label = len(mapping[name]) + mapping[name][value] = label + reverse_mapping[name][label] = value + + # Create a copy with the learned transformation + result = LabelEncoder(partial_order=self._partial_order) result._column_names = column_names + result._mapping = mapping + result._inverse_mapping = reverse_mapping return result @@ -104,21 +134,18 @@ def transform(self, table: Table) -> Table: ValueError If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._mapping is None: raise TransformerNotFittedError - # Input table does not contain all columns used to fit the transformer _check_columns_exist(table, self._column_names) - if table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") + columns = [pl.col(name).replace(self._mapping[name], return_dtype=pl.UInt32) for name in self._column_names] - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) return Table._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), + table._lazy_frame.with_columns(columns), ) def inverse_transform(self, transformed_table: Table) -> Table: @@ -143,99 +170,35 @@ def inverse_transform(self, transformed_table: Table) -> Table: If the transformer has not been fitted yet. ColumnNotFoundError If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError + ColumnTypeError If the specified columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._inverse_mapping is None: raise TransformerNotFittedError _check_columns_exist(transformed_table, self._column_names) - - if transformed_table.number_of_rows == 0: - raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") - - if transformed_table.remove_columns_except( + _check_columns_are_numeric( + transformed_table, self._column_names, - ).remove_non_numeric_columns().number_of_columns < len(self._column_names): - raise NonNumericColumnError( - str( - sorted( - set(self._column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, + operation="inverse-transform with a LabelEncoder", ) - return Table._from_polars_lazy_frame( - transformed_table._lazy_frame.update(new_data.lazy()), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the LabelEncoder. + columns = [pl.col(name).replace(self._inverse_mapping[name]) for name in self._column_names] - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the LabelEncoder. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the LabelEncoder. + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.with_columns(columns), + ) - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the LabelEncoder was fitted on. - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] +def _warn_if_columns_are_numeric(table: Table, column_names: list[str]) -> None: + numeric_columns = table.remove_columns_except(column_names).remove_non_numeric_columns().column_names + if numeric_columns: + warnings.warn( + f"The columns {numeric_columns} contain numerical data. " + "The LabelEncoder is designed to encode non-numerical values into numerical values", + UserWarning, + stacklevel=2, + ) diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index 8b56fda61..7882f663e 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -4,6 +4,7 @@ from collections import Counter from typing import Any +from safeds._utils import _structural_hash from safeds._validation import _check_columns_exist from safeds.data.tabular.containers import Column, Table from safeds.exceptions import ( @@ -56,26 +57,41 @@ class OneHotEncoder(InvertibleTableTransformer): 3 1.0 0.0 0.0 """ + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + def __init__(self) -> None: + super().__init__() + # Maps each old column to (list of) new columns created from it: - self._column_names: dict[str, list[str]] | None = None + self._column_map: dict[str, list[str]] | None = None # Maps concrete values (tuples of old column and value) to corresponding new column names: self._value_to_column: dict[tuple[str, Any], str] | None = None # Maps nan values (str of old column) to corresponding new column name self._value_to_column_nans: dict[str, str] | None = None - def __hash__(self) -> int: - return super().__hash__() - def __eq__(self, other: object) -> bool: if not isinstance(other, OneHotEncoder): return NotImplemented return ( - self._column_names == other._column_names + self._column_map == other._column_map and self._value_to_column == other._value_to_column and self._value_to_column_nans == other._value_to_column_nans ) + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._column_map, + self._value_to_column, + self._value_to_column_nans, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ + def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: """ Learn a transformation for a set of columns in a table. @@ -121,8 +137,8 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: ) result = OneHotEncoder() - - result._column_names = {} + result._column_names = column_names + result._column_map = {} result._value_to_column = {} result._value_to_column_nans = {} @@ -132,7 +148,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: # Iterate through all columns to-be-changed: for column in column_names: - result._column_names[column] = [] + result._column_map[column] = [] for element in table.get_column(column).get_distinct_values(): base_name = f"{column}__{element}" name_counter[base_name] += 1 @@ -141,7 +157,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: if name_counter[base_name] > 1: new_column_name += f"#{name_counter[base_name]}" # Update dictionary entries: - result._column_names[column] += [new_column_name] + result._column_map[column] += [new_column_name] if isinstance(element, float) and np.isnan(element): result._value_to_column_nans[column] = new_column_name else: @@ -180,11 +196,11 @@ def transform(self, table: Table) -> Table: import numpy as np # Transformer has not been fitted yet - if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: + if self._column_map is None or self._value_to_column is None or self._value_to_column_nans is None: raise TransformerNotFittedError # Input table does not contain all columns used to fit the transformer - _check_columns_exist(table, list(self._column_names.keys())) + _check_columns_exist(table, list(self._column_map.keys())) if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") @@ -196,7 +212,7 @@ def transform(self, table: Table) -> Table: encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)] values_not_present_when_fitted = [] - for old_column_name in self._column_names: + for old_column_name in self._column_map: for i in range(table.number_of_rows): value = table.get_column(old_column_name).get_value(i) try: @@ -210,7 +226,7 @@ def transform(self, table: Table) -> Table: # already present in the table the OneHotEncoder was fitted on. values_not_present_when_fitted.append((value, old_column_name)) - for new_column in self._column_names[old_column_name]: + for new_column in self._column_map[old_column_name]: table = table.add_columns([Column(new_column, encoded_values[new_column])]) if len(values_not_present_when_fitted) > 0: @@ -219,7 +235,7 @@ def transform(self, table: Table) -> Table: # New columns may not be sorted: column_names = [] for name in table.column_names: - if name not in self._column_names: + if name not in self._column_map: column_names.append(name) else: column_names.extend( @@ -230,7 +246,7 @@ def transform(self, table: Table) -> Table: # Drop old, non-encoded columns: # (Don't do this earlier - we need the old column nams for sorting, # plus we need to prevent the table from possibly having 0 columns temporarily.) - return table.remove_columns(list(self._column_names.keys())) + return table.remove_columns(list(self._column_map.keys())) def inverse_transform(self, transformed_table: Table) -> Table: """ @@ -260,10 +276,10 @@ def inverse_transform(self, transformed_table: Table) -> Table: If the table contains 0 rows. """ # Transformer has not been fitted yet - if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: + if self._column_map is None or self._value_to_column is None or self._value_to_column_nans is None: raise TransformerNotFittedError - _transformed_column_names = [item for sublist in self._column_names.values() for item in sublist] + _transformed_column_names = [item for sublist in self._column_map.values() for item in sublist] _check_columns_exist(transformed_table, _transformed_column_names) @@ -287,7 +303,7 @@ def inverse_transform(self, transformed_table: Table) -> Table: ) original_columns = {} - for original_column_name in self._column_names: + for original_column_name in self._column_map: original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] for original_column_name, value in self._value_to_column: @@ -311,66 +327,8 @@ def inverse_transform(self, transformed_table: Table) -> Table: table = table.remove_columns(list(self._value_to_column.values())) return table.remove_columns(list(self._value_to_column_nans.values())) - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return ( - self._column_names is not None - and self._value_to_column is not None - and self._value_to_column_nans is not None - ) - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the OneHotEncoder. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return [name for column_names in self._column_names.values() for name in column_names] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that have been changed by the OneHotEncoder (none). - - Returns - ------- - changed_columns: - The empty list. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the OneHotEncoder. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the OneHotEncoder was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: + # TODO: remove / replace with consistent introspection methods across all transformers + def _get_names_of_added_columns(self) -> list[str]: + if self._column_map is None: raise TransformerNotFittedError - return list(self._column_names.keys()) + return [name for column_names in self._column_map.values() for name in column_names] diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py index 0a0d0a1a2..833da4f7e 100644 --- a/src/safeds/data/tabular/transformation/_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_range_scaler.py @@ -2,14 +2,16 @@ from typing import TYPE_CHECKING +from safeds._utils import _structural_hash from safeds._validation import _check_columns_exist +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.tabular.containers import Table -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError +from safeds.exceptions import TransformerNotFittedError from ._invertible_table_transformer import InvertibleTableTransformer if TYPE_CHECKING: - from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + import polars as pl class RangeScaler(InvertibleTableTransformer): @@ -29,13 +31,48 @@ class RangeScaler(InvertibleTableTransformer): If the given minimum is greater or equal to the given maximum """ - def __init__(self, min_: float = 0.0, max_: float = 1.0): - self._column_names: list[str] | None = None - self._wrapped_transformer: sk_MinMaxScaler | None = None + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, min_: float = 0.0, max_: float = 1.0) -> None: + super().__init__() + if min_ >= max_: - raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') - self._minimum = min_ - self._maximum = max_ + raise ValueError('Parameter "max_" must be greater than parameter "min_".') + + # Parameters + self._min: float = min_ + self._max: float = max_ + + # Internal state + self._data_min: pl.DataFrame | None = None + self._data_max: pl.DataFrame | None = None + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._min, + self._max, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def min(self) -> float: + """The minimum of the new range after the transformation.""" + return self._min + + @property + def max(self) -> float: + """The maximum of the new range after the transformation.""" + return self._max + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: """ @@ -48,7 +85,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: table: The table used to fit the transformer. column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. + The list of columns from the table used to fit the transformer. If None, all numeric columns are used. Returns ------- @@ -59,45 +96,29 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: ------ ColumnNotFoundError If column_names contain a column name that is missing in the table. - NonNumericColumnError - If at least one of the specified columns in the table contains non-numerical data. + ColumnTypeError + If at least one of the specified columns in the table is not numeric. ValueError If the table contains 0 rows. """ - from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler - if column_names is None: - column_names = table.column_names + column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: _check_columns_exist(table, column_names) + _check_columns_are_numeric(table, column_names, operation="fit a RangeScaler") if table.number_of_rows == 0: raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") - if ( - table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(column_names).column_names) - - set( - table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) + # Learn the transformation + _data_min = table._lazy_frame.select(column_names).min().collect() + _data_max = table._lazy_frame.select(column_names).max().collect() - wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) - - result = RangeScaler() - result._wrapped_transformer = wrapped_transformer + # Create a copy with the learned transformation + result = RangeScaler(min_=self._min, max_=self._max) result._column_names = column_names + result._data_min = _data_min + result._data_max = _data_max return result @@ -123,41 +144,30 @@ def transform(self, table: Table) -> Table: If the transformer has not been fitted yet. ColumnNotFoundError If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError + ColumnTypeError If at least one of the columns in the input table that is used to fit contains non-numerical data. - ValueError - If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_min is None or self._data_max is None: raise TransformerNotFittedError - # Input table does not contain all columns used to fit the transformer _check_columns_exist(table, self._column_names) - - if table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - - if ( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(self._column_names).column_names) - - set( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, - ), - ), - ), + _check_columns_are_numeric(table, self._column_names, operation="transform with a RangeScaler") + + columns = [ + ( + (pl.col(name) - self._data_min.get_column(name)) + / (self._data_max.get_column(name) - self._data_min.get_column(name)) + * (self._max - self._min) + + self._min ) + for name in self._column_names + ] - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) return Table._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), + table._lazy_frame.with_columns(columns), ) def inverse_transform(self, transformed_table: Table) -> Table: @@ -182,109 +192,32 @@ def inverse_transform(self, transformed_table: Table) -> Table: If the transformer has not been fitted yet. ColumnNotFoundError If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError + ColumnTypeError If the transformed columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_min is None or self._data_max is None: raise TransformerNotFittedError _check_columns_exist(transformed_table, self._column_names) - - if transformed_table.number_of_rows == 0: - raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") - - if ( - transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < transformed_table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(transformed_table.remove_columns_except(self._column_names).column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - import polars as pl - - new_data = pl.DataFrame( - self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, - ), + _check_columns_are_numeric( + transformed_table, + self._column_names, + operation="inverse-transform with a RangeScaler", ) - name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) - - new_data = new_data.rename(name_mapping) + columns = [ + ( + (pl.col(name) - self._min) + / (self._max - self._min) + * (self._data_max.get_column(name) - self._data_min.get_column(name)) + + self._data_min.get_column(name) + ) + for name in self._column_names + ] - return Table._from_polars_data_frame( - transformed_table._data_frame.update(new_data), + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.with_columns(columns), ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the RangeScaler. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - # (Must implement abstract method, cannot instantiate class otherwise.) - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the RangeScaler. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the RangeScaler. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the RangeScaler was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] diff --git a/src/safeds/data/tabular/transformation/_simple_imputer.py b/src/safeds/data/tabular/transformation/_simple_imputer.py index ac0e1ea93..200174bb6 100644 --- a/src/safeds/data/tabular/transformation/_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_simple_imputer.py @@ -40,12 +40,16 @@ class SimpleImputer(TableTransformer): ... Column("b", [None, 2, 3]), ... ], ... ) - >>> transformer = SimpleImputer(SimpleImputer.Strategy.Constant(0)) + >>> transformer = SimpleImputer(SimpleImputer.Strategy.constant(0)) >>> transformed_table = transformer.fit_and_transform(table) """ class Strategy(ABC): - """Various strategies to replace missing values. Use the inner classes to create instances of this class.""" + """ + Various strategies to replace missing values. + + Use the static factory methods to create instances of this class. + """ @abstractmethod def __eq__(self, other: object) -> bool: ... @@ -61,7 +65,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: """Set the imputer strategy of the given imputer.""" @staticmethod - def Constant(value: Any) -> SimpleImputer.Strategy: # noqa: N802 + def constant(value: Any) -> SimpleImputer.Strategy: """ Replace missing values with the given constant value. @@ -73,21 +77,27 @@ def Constant(value: Any) -> SimpleImputer.Strategy: # noqa: N802 return _Constant(value) @staticmethod - def Mean() -> SimpleImputer.Strategy: # noqa: N802 + def mean() -> SimpleImputer.Strategy: """Replace missing values with the mean of each column.""" return _Mean() @staticmethod - def Median() -> SimpleImputer.Strategy: # noqa: N802 + def median() -> SimpleImputer.Strategy: """Replace missing values with the median of each column.""" return _Median() @staticmethod - def Mode() -> SimpleImputer.Strategy: # noqa: N802 + def mode() -> SimpleImputer.Strategy: """Replace missing values with the mode of each column.""" return _Mode() - def __init__(self, strategy: SimpleImputer.Strategy, *, value_to_replace: float | str | None = None): + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + + def __init__(self, strategy: SimpleImputer.Strategy, *, value_to_replace: float | str | None = None) -> None: + super().__init__() + if value_to_replace is None: value_to_replace = pd.NA @@ -95,7 +105,17 @@ def __init__(self, strategy: SimpleImputer.Strategy, *, value_to_replace: float self._value_to_replace = value_to_replace self._wrapped_transformer: sk_SimpleImputer | None = None - self._column_names: list[str] | None = None + + def __hash__(self) -> int: + return _structural_hash( + super().__hash__(), + self._strategy, + self._value_to_replace, + ) + + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ @property def strategy(self) -> SimpleImputer.Strategy: @@ -107,10 +127,9 @@ def value_to_replace(self) -> Any: """The value that should be replaced.""" return self._value_to_replace - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: """ @@ -233,60 +252,6 @@ def transform(self, table: Table) -> Table: table._lazy_frame.update(new_data.lazy()), ) - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the Imputer. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the Imputer. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names - - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the Imputer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - # ---------------------------------------------------------------------------------------------------------------------- # Imputation strategies @@ -372,7 +337,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: # Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. # This is needed for the DSL, where imputer strategies are variants of an enum. -SimpleImputer.Strategy.Constant = _Constant # type: ignore[method-assign] -SimpleImputer.Strategy.Mean = _Mean # type: ignore[method-assign] -SimpleImputer.Strategy.Median = _Median # type: ignore[method-assign] -SimpleImputer.Strategy.Mode = _Mode # type: ignore[method-assign] +SimpleImputer.Strategy.constant = _Constant # type: ignore[method-assign] +SimpleImputer.Strategy.mean = _Mean # type: ignore[method-assign] +SimpleImputer.Strategy.median = _Median # type: ignore[method-assign] +SimpleImputer.Strategy.mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py index 8a8122eb1..b008baa58 100644 --- a/src/safeds/data/tabular/transformation/_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -3,21 +3,36 @@ from typing import TYPE_CHECKING from safeds._validation import _check_columns_exist +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.tabular.containers import Table -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError +from safeds.exceptions import TransformerNotFittedError from ._invertible_table_transformer import InvertibleTableTransformer if TYPE_CHECKING: - from sklearn.preprocessing import StandardScaler as sk_StandardScaler + import polars as pl class StandardScaler(InvertibleTableTransformer): """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + def __init__(self) -> None: - self._column_names: list[str] | None = None - self._wrapped_transformer: sk_StandardScaler | None = None + super().__init__() + + # Internal state + self._data_mean: pl.DataFrame | None = None + self._data_standard_deviation: pl.DataFrame | None = None + + def __hash__(self) -> int: + return super().__hash__() + + # ------------------------------------------------------------------------------------------------------------------ + # Learning and transformation + # ------------------------------------------------------------------------------------------------------------------ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: """ @@ -41,45 +56,29 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: ------ ColumnNotFoundError If column_names contain a column name that is missing in the table. - NonNumericColumnError + ColumnTypeError If at least one of the specified columns in the table contains non-numerical data. ValueError If the table contains 0 rows. """ - from sklearn.preprocessing import StandardScaler as sk_StandardScaler - if column_names is None: - column_names = table.column_names + column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: _check_columns_exist(table, column_names) + _check_columns_are_numeric(table, column_names, operation="fit a StandardScaler") if table.number_of_rows == 0: raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") - if ( - table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(column_names).column_names) - - set( - table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - wrapped_transformer = sk_StandardScaler() - wrapped_transformer.set_output(transform="polars") - wrapped_transformer.fit( - table.remove_columns_except(column_names)._data_frame, - ) + # Learn the transformation (ddof=0 is used to match the behavior of scikit-learn) + _data_mean = table._lazy_frame.select(column_names).mean().collect() + _data_standard_deviation = table._lazy_frame.select(column_names).std(ddof=0).collect() + # Create a copy with the learned transformation result = StandardScaler() - result._wrapped_transformer = wrapped_transformer result._column_names = column_names + result._data_mean = _data_mean + result._data_standard_deviation = _data_standard_deviation return result @@ -105,41 +104,25 @@ def transform(self, table: Table) -> Table: If the transformer has not been fitted yet. ColumnNotFoundError If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError + ColumnTypeError If at least one of the columns in the input table that is used to fit contains non-numerical data. - ValueError - If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_mean is None or self._data_standard_deviation is None: raise TransformerNotFittedError - # Input table does not contain all columns used to fit the transformer _check_columns_exist(table, self._column_names) + _check_columns_are_numeric(table, self._column_names, operation="transform with a StandardScaler") + + columns = [ + (pl.col(name) - self._data_mean.get_column(name)) / self._data_standard_deviation.get_column(name) + for name in self._column_names + ] - if table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - - if ( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(table.remove_columns_except(self._column_names).column_names) - - set( - table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.transform( - table.remove_columns_except(self._column_names)._data_frame, - ) return Table._from_polars_lazy_frame( - table._lazy_frame.update(new_data.lazy()), + table._lazy_frame.with_columns(columns), ) def inverse_transform(self, transformed_table: Table) -> Table: @@ -164,99 +147,27 @@ def inverse_transform(self, transformed_table: Table) -> Table: If the transformer has not been fitted yet. ColumnNotFoundError If the input table does not contain all columns used to fit the transformer. - NonNumericColumnError + ColumnTypeError If the transformed columns of the input table contain non-numerical data. - ValueError - If the table contains 0 rows. """ - # Transformer has not been fitted yet - if self._wrapped_transformer is None or self._column_names is None: + import polars as pl + + # Used in favor of is_fitted, so the type checker is happy + if self._column_names is None or self._data_mean is None or self._data_standard_deviation is None: raise TransformerNotFittedError _check_columns_exist(transformed_table, self._column_names) - - if transformed_table.number_of_rows == 0: - raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") - - if ( - transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns - < transformed_table.remove_columns_except(self._column_names).number_of_columns - ): - raise NonNumericColumnError( - str( - sorted( - set(transformed_table.remove_columns_except(self._column_names).column_names) - - set( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, - ), - ), - ), - ) - - new_data = self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, + _check_columns_are_numeric( + transformed_table, + self._column_names, + operation="inverse-transform with a StandardScaler", ) - return Table._from_polars_data_frame( - transformed_table._data_frame.update(new_data), - ) - - @property - def is_fitted(self) -> bool: - """Whether the transformer is fitted.""" - return self._wrapped_transformer is not None - - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the StandardScaler. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] - - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that may have been changed by the StandardScaler. - - Returns - ------- - changed_columns: - The list of (potentially) changed column names, as passed to fit. - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if self._column_names is None: - raise TransformerNotFittedError - return self._column_names + columns = [ + pl.col(name) * self._data_standard_deviation.get_column(name) + self._data_mean.get_column(name) + for name in self._column_names + ] - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the StandardScaler. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the StandardScaler was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - if not self.is_fitted: - raise TransformerNotFittedError - return [] + return Table._from_polars_lazy_frame( + transformed_table._lazy_frame.with_columns(columns), + ) diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py index 2968df41f..714fc503f 100644 --- a/src/safeds/data/tabular/transformation/_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_table_transformer.py @@ -16,31 +16,30 @@ class TableTransformer(ABC): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __hash__(self) -> int: - """ - Return a deterministic hash value for a table transformer. + # The decorator is needed so the class really cannot be instantiated + @abstractmethod + def __init__(self) -> None: + self._column_names: list[str] | None = None - Returns - ------- - hash: - The hash value. - """ - added = self.get_names_of_added_columns() if self.is_fitted else [] - changed = self.get_names_of_changed_columns() if self.is_fitted else [] - removed = self.get_names_of_removed_columns() if self.is_fitted else [] - return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) + # The decorator ensures that the method is overridden in all subclasses + @abstractmethod + def __hash__(self) -> int: + return _structural_hash( + self.__class__.__qualname__, + self._column_names, + ) # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @property - @abstractmethod def is_fitted(self) -> bool: """Whether the transformer is fitted.""" + return self._column_names is not None # ------------------------------------------------------------------------------------------------------------------ - # Methods + # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ @abstractmethod @@ -86,60 +85,10 @@ def transform(self, table: Table) -> Table: If the transformer has not been fitted yet. """ - # ------------------------------------------------------------------------------------------------------------------ - # Introspection - # ------------------------------------------------------------------------------------------------------------------ - - @abstractmethod - def get_names_of_added_columns(self) -> list[str]: - """ - Get the names of all new columns that have been added by the transformer. - - Returns - ------- - added_columns: - A list of names of the added columns, ordered as they will appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - - @abstractmethod - def get_names_of_changed_columns(self) -> list[str]: - """ - Get the names of all columns that have been changed by the transformer. - - Returns - ------- - changed_columns: - A list of names of changed columns, ordered as they appear in the table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - - @abstractmethod - def get_names_of_removed_columns(self) -> list[str]: - """ - Get the names of all columns that have been removed by the transformer. - - Returns - ------- - removed_columns: - A list of names of the removed columns, ordered as they appear in the table the transformer was fitted on. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. - """ - def fit_and_transform( - self, table: Table, column_names: list[str] | None = None, + self, + table: Table, + column_names: list[str] | None = None, ) -> tuple[Self, Table]: """ Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 7592a1972..8f1e9de6d 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -34,6 +34,10 @@ class ColumnNotFoundError(SafeDsError): """Exception raised when trying to access an invalid column name.""" +class ColumnTypeError(SafeDsError): + """Exception raised when a column has the wrong type.""" + + class FileExtensionError(SafeDsError): """Exception raised when a path has the wrong file extension.""" @@ -45,6 +49,7 @@ class OutOfBoundsError(SafeDsError): __all__ = [ "SafeDsError", "ColumnNotFoundError", + "ColumnTypeError", "FileExtensionError", "OutOfBoundsError", # TODO diff --git a/src/safeds/ml/classical/_bases/_ada_boost_base.py b/src/safeds/ml/classical/_bases/_ada_boost_base.py index e6492a564..5264bc3a0 100644 --- a/src/safeds/ml/classical/_bases/_ada_boost_base.py +++ b/src/safeds/ml/classical/_bases/_ada_boost_base.py @@ -11,7 +11,6 @@ class _AdaBoostBase(ABC): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_bases/_decision_tree_base.py b/src/safeds/ml/classical/_bases/_decision_tree_base.py index 6dd19d058..f71823696 100644 --- a/src/safeds/ml/classical/_bases/_decision_tree_base.py +++ b/src/safeds/ml/classical/_bases/_decision_tree_base.py @@ -7,7 +7,6 @@ class _DecisionTreeBase(ABC): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_bases/_gradient_boosting_base.py b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py index 51b4f9913..357577aeb 100644 --- a/src/safeds/ml/classical/_bases/_gradient_boosting_base.py +++ b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py @@ -7,7 +7,6 @@ class _GradientBoostingBase(ABC): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py index ecb722238..70022b65a 100644 --- a/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py +++ b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py @@ -7,7 +7,6 @@ class _KNearestNeighborsBase(ABC): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_bases/_random_forest_base.py b/src/safeds/ml/classical/_bases/_random_forest_base.py index 94122e5d4..42c397b30 100644 --- a/src/safeds/ml/classical/_bases/_random_forest_base.py +++ b/src/safeds/ml/classical/_bases/_random_forest_base.py @@ -7,7 +7,6 @@ class _RandomForestBase(ABC): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_bases/_support_vector_machine_base.py b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py index 09e5577a1..fc85a4b58 100644 --- a/src/safeds/ml/classical/_bases/_support_vector_machine_base.py +++ b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py @@ -117,7 +117,6 @@ def kernel(self) -> _SupportVectorMachineBase.Kernel: class _Linear(_SupportVectorMachineBase.Kernel): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ @@ -142,7 +141,6 @@ def _apply(self, model: SklearnSVC) -> None: class _Polynomial(_SupportVectorMachineBase.Kernel): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ @@ -188,7 +186,6 @@ def _apply(self, model: SklearnSVC) -> None: class _RadialBasisFunction(_SupportVectorMachineBase.Kernel): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ @@ -213,7 +210,6 @@ def _apply(self, model: SklearnSVC) -> None: class _Sigmoid(_SupportVectorMachineBase.Kernel): - # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_supervised_model.py b/src/safeds/ml/classical/_supervised_model.py index 7824cf9e9..c4658bfc1 100644 --- a/src/safeds/ml/classical/_supervised_model.py +++ b/src/safeds/ml/classical/_supervised_model.py @@ -59,7 +59,7 @@ def is_fitted(self) -> bool: return None not in (self._feature_schema, self._target_name, self._target_type, self._wrapped_model) # ------------------------------------------------------------------------------------------------------------------ - # Machine learning + # Learning and prediction # ------------------------------------------------------------------------------------------------------------------ def fit(self, training_set: TabularDataset) -> Self: diff --git a/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py index 0d3c6ace2..73294108e 100644 --- a/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py +++ b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py @@ -10,9 +10,6 @@ if TYPE_CHECKING: from sklearn.base import ClassifierMixin - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table - class GradientBoostingClassifier(Classifier, _GradientBoostingBase): """ diff --git a/src/safeds/ml/classical/classification/_support_vector_classifier.py b/src/safeds/ml/classical/classification/_support_vector_classifier.py index 509503765..407c8f97a 100644 --- a/src/safeds/ml/classical/classification/_support_vector_classifier.py +++ b/src/safeds/ml/classical/classification/_support_vector_classifier.py @@ -71,14 +71,6 @@ def _clone(self) -> SupportVectorClassifier: ) def _get_sklearn_model(self) -> ClassifierMixin: - """ - Return a new wrapped Classifier from sklearn. - - Returns - ------- - wrapped_classifier: - The sklearn Classifier. - """ from sklearn.svm import SVC as SklearnSVC # noqa: N811 result = SklearnSVC( diff --git a/src/safeds/ml/nn/_model.py b/src/safeds/ml/nn/_model.py index 6cdea9df0..3954606a3 100644 --- a/src/safeds/ml/nn/_model.py +++ b/src/safeds/ml/nn/_model.py @@ -493,7 +493,6 @@ def _create_internal_model( class _InternalModel(nn.Module): def __init__(self, layers: list[Layer], is_for_classification: bool) -> None: - super().__init__() self._layer_list = layers internal_layers = [] diff --git a/src/safeds/ml/nn/converters/_output_converter_image.py b/src/safeds/ml/nn/converters/_output_converter_image.py index e435a7543..5aad88599 100644 --- a/src/safeds/ml/nn/converters/_output_converter_image.py +++ b/src/safeds/ml/nn/converters/_output_converter_image.py @@ -19,7 +19,6 @@ class _OutputConversionImage(OutputConversion[ImageList, ImageDataset], ABC): - @abstractmethod def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: Any) -> ImageDataset: pass # pragma: no cover @@ -66,7 +65,6 @@ def __sizeof__(self) -> int: class OutputConversionImageToColumn(_OutputConversionImage): - def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: Any) -> ImageDataset[Column]: import torch @@ -85,7 +83,7 @@ def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: one_hot_encoder: OneHotEncoder = kwargs["one_hot_encoder"] column_name: str = kwargs["column_name"] - output = torch.zeros(len(input_data), len(one_hot_encoder.get_names_of_added_columns())) + output = torch.zeros(len(input_data), len(one_hot_encoder._get_names_of_added_columns())) output[torch.arange(len(input_data)), output_data] = 1 im_dataset: ImageDataset[Column] = ImageDataset[Column].__new__(ImageDataset) @@ -100,7 +98,6 @@ def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: class OutputConversionImageToTable(_OutputConversionImage): - def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: Any) -> ImageDataset[Table]: import torch @@ -133,7 +130,6 @@ def _data_conversion(self, input_data: ImageList, output_data: Tensor, **kwargs: class OutputConversionImageToImage(_OutputConversionImage): - def _data_conversion( self, input_data: ImageList, diff --git a/src/safeds/ml/nn/layers/_convolutional2d_layer.py b/src/safeds/ml/nn/layers/_convolutional2d_layer.py index 3adbf0d8e..70b717487 100644 --- a/src/safeds/ml/nn/layers/_convolutional2d_layer.py +++ b/src/safeds/ml/nn/layers/_convolutional2d_layer.py @@ -246,7 +246,6 @@ def __sizeof__(self) -> int: class ConvolutionalTranspose2DLayer(Convolutional2DLayer): - def __init__( self, output_channel: int, diff --git a/src/safeds/ml/nn/layers/_pooling2d_layer.py b/src/safeds/ml/nn/layers/_pooling2d_layer.py index 9a615b376..ffd6c2f9d 100644 --- a/src/safeds/ml/nn/layers/_pooling2d_layer.py +++ b/src/safeds/ml/nn/layers/_pooling2d_layer.py @@ -177,7 +177,6 @@ def __sizeof__(self) -> int: class MaxPooling2DLayer(_Pooling2DLayer): - def __init__(self, kernel_size: int, *, stride: int = -1, padding: int = 0) -> None: """ Create a maximum Pooling 2D Layer. @@ -195,7 +194,6 @@ def __init__(self, kernel_size: int, *, stride: int = -1, padding: int = 0) -> N class AveragePooling2DLayer(_Pooling2DLayer): - def __init__(self, kernel_size: int, *, stride: int = -1, padding: int = 0) -> None: """ Create a average Pooling 2D Layer. diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py index 883650a52..20a9ac339 100644 --- a/tests/helpers/__init__.py +++ b/tests/helpers/__init__.py @@ -1,5 +1,5 @@ from ._assertions import ( - assert_that_tables_are_close, + assert_tables_equal, assert_that_tabular_datasets_are_equal, ) from ._devices import ( @@ -36,7 +36,7 @@ from ._resources import resolve_resource_path __all__ = [ - "assert_that_tables_are_close", + "assert_tables_equal", "assert_that_tabular_datasets_are_equal", "configure_test_with_device", "device_cpu", diff --git a/tests/helpers/_assertions.py b/tests/helpers/_assertions.py index be13f978a..8976fc063 100644 --- a/tests/helpers/_assertions.py +++ b/tests/helpers/_assertions.py @@ -1,9 +1,9 @@ -import pytest +from polars.testing import assert_frame_equal from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table -def assert_that_tables_are_close(table1: Table, table2: Table) -> None: +def assert_tables_equal(table1: Table, table2: Table) -> None: """ Assert that two tables are almost equal. @@ -14,15 +14,7 @@ def assert_that_tables_are_close(table1: Table, table2: Table) -> None: table2: Table The table to compare the first table to. """ - assert table1.schema == table2.schema - for column_name in table1.column_names: - assert table1.get_column(column_name).type == table2.get_column(column_name).type - assert table1.get_column(column_name).type.is_numeric - assert table2.get_column(column_name).type.is_numeric - for i in range(table1.number_of_rows): - entry_1 = table1.get_column(column_name).get_value(i) - entry_2 = table2.get_column(column_name).get_value(i) - assert entry_1 == pytest.approx(entry_2) + assert_frame_equal(table1._data_frame, table2._data_frame) def assert_that_tabular_datasets_are_equal(table1: TabularDataset, table2: TabularDataset) -> None: diff --git a/tests/safeds/data/image/containers/test_image.py b/tests/safeds/data/image/containers/test_image.py index 5b1e51672..ad3ae820b 100644 --- a/tests/safeds/data/image/containers/test_image.py +++ b/tests/safeds/data/image/containers/test_image.py @@ -109,7 +109,6 @@ def test_should_write_and_load_bytes_png(self, resource_path: str | Path, device @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestToNumpyArray: - @pytest.mark.parametrize( "resource_path", images_all(), @@ -192,9 +191,12 @@ def test_should_raise_if_image_has_alpha_channel(self, resource_path: str | Path image = Image.from_file(resolve_resource_path(resource_path)) with NamedTemporaryFile(suffix=".jpg") as tmp_jpeg_file: tmp_jpeg_file.close() - with Path(tmp_jpeg_file.name).open("w", encoding="utf-8") as tmp_file, pytest.raises( - IllegalFormatError, - match=r"This format is illegal. Use one of the following formats: png", + with ( + Path(tmp_jpeg_file.name).open("w", encoding="utf-8") as tmp_file, + pytest.raises( + IllegalFormatError, + match=r"This format is illegal. Use one of the following formats: png", + ), ): image.to_jpeg_file(tmp_file.name) @@ -1025,7 +1027,6 @@ def test_should_return_edges_of_image( class TestFilterEdgesKernel: - def test_should_kernel_change_device(self) -> None: assert Image._filter_edges_kernel().device == _get_device() configure_test_with_device(device_cpu) diff --git a/tests/safeds/data/image/containers/test_image_list.py b/tests/safeds/data/image/containers/test_image_list.py index 5e8f8fb96..9b27f8f52 100644 --- a/tests/safeds/data/image/containers/test_image_list.py +++ b/tests/safeds/data/image/containers/test_image_list.py @@ -6,8 +6,6 @@ import pytest import torch -from torch.types import Device - from safeds.data.image.containers import Image, ImageList from safeds.data.image.containers._empty_image_list import _EmptyImageList from safeds.data.image.containers._multi_size_image_list import _MultiSizeImageList @@ -16,8 +14,12 @@ from safeds.exceptions import DuplicateIndexError, IllegalFormatError, IndexOutOfBoundsError, OutOfBoundsError from syrupy import SnapshotAssertion from torch import Tensor +from torch.types import Device from tests.helpers import ( + configure_test_with_device, + get_devices, + get_devices_ids, grayscale_jpg_path, grayscale_png_path, images_all, @@ -31,9 +33,6 @@ skip_if_os, test_images_folder, white_square_jpg_path, - get_devices, - get_devices_ids, - configure_test_with_device, ) @@ -42,7 +41,6 @@ @pytest.mark.parametrize("resource_path1", images_all(), ids=images_all_ids()) @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestAllImageCombinations: - def test_from_files(self, resource_path1: str, resource_path2: str, resource_path3: str, device: Device) -> None: # Setup configure_test_with_device(device) @@ -442,7 +440,6 @@ def test_from_files(self, resource_path1: str, resource_path2: str, resource_pat @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestFromFiles: - @pytest.mark.parametrize( "resource_path", [ @@ -463,7 +460,10 @@ class TestFromFiles: ], ) def test_from_files_creation_return_filenames( - self, resource_path: str | Path, snapshot_png_image_list: SnapshotAssertion, device: Device + self, + resource_path: str | Path, + snapshot_png_image_list: SnapshotAssertion, + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -495,7 +495,10 @@ def test_from_files_creation_return_filenames( ], ) def test_from_files_creation_load_percentage( - self, resource_path: str | Path, snapshot_png_image_list: SnapshotAssertion, device: Device + self, + resource_path: str | Path, + snapshot_png_image_list: SnapshotAssertion, + device: Device, ) -> None: random.seed(420) configure_test_with_device(device) @@ -534,7 +537,10 @@ def test_should_raise_if_one_file_or_directory_not_found(self, resource_path: st [-1.0, 2.0], ) def test_should_raise_if_load_percentage_out_of_bounds( - self, resource_path: str | Path, load_percentage: float, device: Device + self, + resource_path: str | Path, + load_percentage: float, + device: Device, ) -> None: configure_test_with_device(device) with pytest.raises(OutOfBoundsError): @@ -544,7 +550,7 @@ def test_create_from_single_sized_image_lists_one_image_list(self, device: Devic configure_test_with_device(device) assert isinstance( _MultiSizeImageList()._create_from_single_sized_image_lists( - [ImageList.from_files(resolve_resource_path(plane_png_path))._as_single_size_image_list()] + [ImageList.from_files(resolve_resource_path(plane_png_path))._as_single_size_image_list()], ), _SingleSizeImageList, ) @@ -555,7 +561,10 @@ def test_create_from_single_sized_image_lists_one_image_list(self, device: Devic ids=["all-images"], ) def test_create_from_single_sized_image_lists( - self, resource_path: str | Path, snapshot_png_image_list: SnapshotAssertion, device: Device + self, + resource_path: str | Path, + snapshot_png_image_list: SnapshotAssertion, + device: Device, ) -> None: configure_test_with_device(device) image_lists = ImageList.from_files(resolve_resource_path(resource_path)) @@ -569,7 +578,6 @@ def test_create_from_single_sized_image_lists( @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestToImages: - @pytest.mark.parametrize( "resource_path", [images_all(), [plane_png_path, plane_jpg_path] * 2], @@ -600,7 +608,6 @@ def test_from_files_creation(self, resource_path: list[str], device: Device) -> @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestToJpegFiles: - @pytest.mark.parametrize( "resource_path", [images_all(), [plane_png_path, plane_jpg_path]], @@ -692,7 +699,9 @@ def test_should_save_images_in_directory(self, resource_path: list[str], device: ids=["all-jpg-images", "jpg-planes", "jpg-grayscale"], ) def test_should_save_images_in_directories_for_different_sizes( - self, resource_path: list[str], device: Device + self, + resource_path: list[str], + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -757,7 +766,6 @@ def test_should_save_images_in_files(self, resource_path: list[str], device: Dev @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestToPngFiles: - @pytest.mark.parametrize( "resource_path", [images_all(), [plane_png_path, plane_jpg_path], [grayscale_png_path, grayscale_png_path]], @@ -794,7 +802,9 @@ def test_should_save_images_in_directory(self, resource_path: list[str], device: ids=["all-images", "planes", "grayscale"], ) def test_should_save_images_in_directories_for_different_sizes( - self, resource_path: list[str], device: Device + self, + resource_path: list[str], + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -842,14 +852,16 @@ def test_should_save_images_in_files(self, resource_path: list[str], device: Dev @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestShuffleImages: - @pytest.mark.parametrize( "resource_path", [images_all(), [plane_png_path, plane_jpg_path] * 2], ids=["all-images", "planes"], ) def test_shuffle_images( - self, resource_path: list[str], snapshot_png_image_list: SnapshotAssertion, device: Device + self, + resource_path: list[str], + snapshot_png_image_list: SnapshotAssertion, + device: Device, ) -> None: configure_test_with_device(device) image_list_original = ImageList.from_files(resolve_resource_path(resource_path)) @@ -871,7 +883,6 @@ def test_shuffle_images( @pytest.mark.parametrize("resource_path1", images_all_channel(), ids=images_all_channel_ids()) @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestTransformsEqualImageTransforms: - @pytest.mark.parametrize( ("method", "attributes"), [ @@ -1016,9 +1027,7 @@ def test_change_channel_of_tensor(self, channel_in: int, channel_out: int, devic ) @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestErrorsAndWarningsWithoutEmptyImageList: - class TestAddImageTensor: - def test_should_raise(self, resource_path: list[str], device: Device) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1026,14 +1035,12 @@ def test_should_raise(self, resource_path: list[str], device: Device) -> None: image_list._add_image_tensor(image_list.to_images([0])[0]._image_tensor, 0) class TestEquals: - def test_should_raise(self, resource_path: list[str], device: Device) -> None: configure_test_with_device(device) image_list_original = ImageList.from_files(resolve_resource_path(resource_path)) assert (image_list_original.__eq__(image_list_original.to_images([0]))) is NotImplemented class TestCrop: - @pytest.mark.parametrize( ("new_x", "new_y"), [(10000, 1), (1, 10000), (10000, 10000)], @@ -1057,7 +1064,6 @@ def test_should_warn_if_coordinates_outsize_image( assert torch.all(torch.eq(cropped_image_list._as_single_size_image_list()._tensor, image_blank_tensor)) class TestAdjustColorBalance: - def test_should_not_adjust_color_balance_channel_1( self, resource_path: list[str], @@ -1084,9 +1090,7 @@ def test_should_not_adjust_color_balance_channel_1( ) @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestErrorsAndWarningsWithEmptyImageList: - class TestChangeChannel: - @pytest.mark.parametrize( "channel", [-1, 0, 2, 5], @@ -1102,7 +1106,6 @@ def test_should_raise(self, resource_path: list[str], channel: int, device: Devi image_list.change_channel(channel) class TestRemoveImageByIndex: - def test_should_raise_invalid_index(self, resource_path: list[str], device: Device) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1112,14 +1115,17 @@ def test_should_raise_invalid_index(self, resource_path: list[str], device: Devi image_list.remove_image_by_index(len(image_list)) class TestRemoveImagesWithSize: - @pytest.mark.parametrize( ("width", "height"), [(-10, 10), (10, -10), (-10, -10)], ids=["invalid width", "invalid height", "invalid width and height"], ) def test_should_raise_negative_size( - self, resource_path: list[str], width: int, height: int, device: Device + self, + resource_path: list[str], + width: int, + height: int, + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1127,14 +1133,17 @@ def test_should_raise_negative_size( image_list.remove_images_with_size(width, height) class TestResize: - @pytest.mark.parametrize( ("new_width", "new_height"), [(-10, 10), (10, -10), (-10, -10)], ids=["invalid width", "invalid height", "invalid width and height"], ) def test_should_raise_new_size( - self, resource_path: list[str], new_width: int, new_height: int, device: Device + self, + resource_path: list[str], + new_width: int, + new_height: int, + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1142,14 +1151,17 @@ def test_should_raise_new_size( image_list.resize(new_width, new_height) class TestCrop: - @pytest.mark.parametrize( ("new_width", "new_height"), [(-10, 1), (1, -10), (-10, -1)], ids=["invalid width", "invalid height", "invalid width and height"], ) def test_should_raise_invalid_size( - self, resource_path: list[str], new_width: int, new_height: int, device: Device + self, + resource_path: list[str], + new_width: int, + new_height: int, + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1162,7 +1174,11 @@ def test_should_raise_invalid_size( ids=["invalid x", "invalid y", "invalid x and y"], ) def test_should_raise_invalid_coordinates( - self, resource_path: list[str], new_x: int, new_y: int, device: Device + self, + resource_path: list[str], + new_x: int, + new_y: int, + device: Device, ) -> None: configure_test_with_device(device) image_list = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1170,14 +1186,16 @@ def test_should_raise_invalid_coordinates( image_list.crop(new_x, new_y, 100, 100) class TestAddNoise: - @pytest.mark.parametrize( "standard_deviation", [-1], ids=["sigma below zero"], ) def test_should_raise_standard_deviation( - self, resource_path: list[str], standard_deviation: float, device: Device + self, + resource_path: list[str], + standard_deviation: float, + device: Device, ) -> None: configure_test_with_device(device) image_list_original = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1187,7 +1205,6 @@ def test_should_raise_standard_deviation( assert image_list_original == image_list_clone class TestAdjustBrightness: - @pytest.mark.parametrize( "factor", [-1], @@ -1223,7 +1240,6 @@ def test_should_not_brighten( assert image_list_original == image_list_clone class TestAdjustContrast: - @pytest.mark.parametrize( "factor", [-1], @@ -1259,7 +1275,6 @@ def test_should_not_adjust( assert image_list_original == image_list_clone class TestAdjustColorBalance: - @pytest.mark.parametrize( "factor", [-1], @@ -1295,7 +1310,6 @@ def test_should_not_adjust_color_balance_factor_1( assert image_list_original == image_list_clone class TestBlur: - def test_should_raise_radius_out_of_bounds(self, resource_path: str, device: Device) -> None: configure_test_with_device(device) image_list_original = ImageList.from_files(resolve_resource_path(resource_path)) @@ -1325,7 +1339,6 @@ def test_should_not_blur(self, resource_path: str, device: Device) -> None: assert image_list_original == image_list_clone class TestSharpen: - @pytest.mark.parametrize( "factor", [-1], @@ -1363,7 +1376,6 @@ def test_should_not_adjust( @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestSingleSizeImageList: - @pytest.mark.parametrize( "tensor", [ @@ -1467,7 +1479,6 @@ def test_get_batch_and_iterate_4_dim(self, tensor: Tensor, device: Device) -> No @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestEmptyImageList: - def test_warn_empty_image_list(self, device: Device) -> None: configure_test_with_device(device) with pytest.warns( @@ -1493,7 +1504,8 @@ def test_create_image_list(self, image_list: ImageList, device: Device) -> None: def test_create_image_list_from_files(self, device: Device) -> None: configure_test_with_device(device) assert isinstance( - _SingleSizeImageList()._create_image_list_from_files({}, 0, 4, 1, 1, {}, 5)[0], _EmptyImageList + _SingleSizeImageList()._create_image_list_from_files({}, 0, 4, 1, 1, {}, 5)[0], + _EmptyImageList, ) def test_create_from_single_sized_image_lists(self, device: Device) -> None: diff --git a/tests/safeds/data/image/typing/test_image_size.py b/tests/safeds/data/image/typing/test_image_size.py index 88cfe0b8e..f8622c2de 100644 --- a/tests/safeds/data/image/typing/test_image_size.py +++ b/tests/safeds/data/image/typing/test_image_size.py @@ -8,18 +8,17 @@ from torch.types import Device from tests.helpers import ( + configure_test_with_device, get_devices, get_devices_ids, images_all, images_all_ids, plane_png_path, resolve_resource_path, - configure_test_with_device, ) class TestFromImage: - @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) @pytest.mark.parametrize("resource_path", images_all(), ids=images_all_ids()) def test_should_create(self, resource_path: str, device: Device) -> None: @@ -30,7 +29,6 @@ def test_should_create(self, resource_path: str, device: Device) -> None: class TestEq: - @pytest.mark.parametrize(("image_size", "width", "height", "channel"), [(ImageSize(1, 2, 3), 1, 2, 3)]) def test_should_be_equal(self, image_size: ImageSize, width: int, height: int, channel: int) -> None: assert image_size == ImageSize(width, height, channel) @@ -52,7 +50,6 @@ def test_should_be_not_implemented(self, image_size: ImageSize, other: Any) -> N class TestHash: - @pytest.mark.parametrize( "resource_path", images_all(), @@ -68,21 +65,18 @@ def test_hash_should_not_be_equal(self) -> None: class TestSizeOf: - @pytest.mark.parametrize("image_size", [ImageSize(1, 2, 3)]) def test_should_size_be_greater_than_normal_object(self, image_size: ImageSize) -> None: assert sys.getsizeof(image_size) >= sys.getsizeof(0) * 3 class TestStr: - @pytest.mark.parametrize("image_size", [ImageSize(1, 2, 3)]) def test_should_size_be_greater_than_normal_object(self, image_size: ImageSize) -> None: assert str(image_size) == f"{image_size.width}x{image_size.height}x{image_size.channel} (WxHxC)" class TestProperties: - @pytest.mark.parametrize("width", list(range(1, 5))) @pytest.mark.parametrize("height", list(range(1, 5))) @pytest.mark.parametrize("channel", [1, 3, 4]) @@ -98,7 +92,6 @@ def test_should_ignore_invalid_channel(self, channel: int) -> None: class TestErrors: - @pytest.mark.parametrize("width", [-1, 0]) def test_should_raise_invalid_width(self, width: int) -> None: with pytest.raises(OutOfBoundsError): diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py index 5ff34f91c..512cc1b14 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py @@ -1,11 +1,10 @@ import pytest -from torch.types import Device - from safeds._config import _get_device from safeds.data.tabular.containers import Table +from torch.types import Device from torch.utils.data import DataLoader -from tests.helpers import get_devices, get_devices_ids, configure_test_with_device +from tests.helpers import configure_test_with_device, get_devices, get_devices_ids @pytest.mark.parametrize( diff --git a/tests/safeds/data/labeled/containers/test_image_dataset.py b/tests/safeds/data/labeled/containers/test_image_dataset.py index 120e18d80..ab0cbe197 100644 --- a/tests/safeds/data/labeled/containers/test_image_dataset.py +++ b/tests/safeds/data/labeled/containers/test_image_dataset.py @@ -38,7 +38,6 @@ @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestImageDatasetInit: - @pytest.mark.parametrize( ("input_data", "output_data", "error", "error_msg"), [ @@ -100,7 +99,12 @@ class TestImageDatasetInit: ], ) def test_should_raise_with_invalid_data( - self, input_data: ImageList, output_data: T, error: type[Exception], error_msg: str, device: Device, + self, + input_data: ImageList, + output_data: T, + error: type[Exception], + error_msg: str, + device: Device, ) -> None: configure_test_with_device(device) with pytest.raises(error, match=error_msg): @@ -109,7 +113,6 @@ def test_should_raise_with_invalid_data( @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestLength: - def test_should_return_length(self, device: Device) -> None: configure_test_with_device(device) image_dataset = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), Column("images", [1])) @@ -120,7 +123,6 @@ def test_should_return_length(self, device: Device) -> None: @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestEq: - @pytest.mark.parametrize( "image_dataset_output", [ @@ -131,8 +133,22 @@ class TestEq: ) def test_should_be_equal(self, image_dataset_output: str | Column | Table, device: Device) -> None: configure_test_with_device(device) - image_dataset1 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] - image_dataset2 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] + image_dataset1 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset_output)) + if isinstance(image_dataset_output, str) + else image_dataset_output + ), + ) # type: ignore[type-var] + image_dataset2 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset_output)) + if isinstance(image_dataset_output, str) + else image_dataset_output + ), + ) # type: ignore[type-var] assert image_dataset1 is not image_dataset2 assert image_dataset1 == image_dataset2 assert image_dataset1._input._tensor.device == _get_device() @@ -169,8 +185,22 @@ def test_should_not_be_equal( device: Device, ) -> None: configure_test_with_device(device) - image_dataset1 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset1_output)) if isinstance(image_dataset1_output, str) else image_dataset1_output) # type: ignore[type-var] - image_dataset2 = ImageDataset(ImageList.from_files(resolve_resource_path(image_dataset2_input)), ImageList.from_files(resolve_resource_path(image_dataset2_output)) if isinstance(image_dataset2_output, str) else image_dataset2_output) # type: ignore[type-var] + image_dataset1 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset1_output)) + if isinstance(image_dataset1_output, str) + else image_dataset1_output + ), + ) # type: ignore[type-var] + image_dataset2 = ImageDataset( + ImageList.from_files(resolve_resource_path(image_dataset2_input)), + ( + ImageList.from_files(resolve_resource_path(image_dataset2_output)) + if isinstance(image_dataset2_output, str) + else image_dataset2_output + ), + ) # type: ignore[type-var] assert image_dataset1 != image_dataset2 assert image_dataset1._input._tensor.device == _get_device() assert image_dataset1._output._tensor.device == _get_device() @@ -186,7 +216,6 @@ def test_should_be_not_implemented(self, device: Device) -> None: @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestHash: - @pytest.mark.parametrize( "image_dataset_output", [ @@ -197,8 +226,22 @@ class TestHash: ) def test_hash_should_be_equal(self, image_dataset_output: str | Column | Table, device: Device) -> None: configure_test_with_device(device) - image_dataset1 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] - image_dataset2 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] + image_dataset1 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset_output)) + if isinstance(image_dataset_output, str) + else image_dataset_output + ), + ) # type: ignore[type-var] + image_dataset2 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset_output)) + if isinstance(image_dataset_output, str) + else image_dataset_output + ), + ) # type: ignore[type-var] assert image_dataset1 is not image_dataset2 assert hash(image_dataset1) == hash(image_dataset2) assert image_dataset1._input._tensor.device == _get_device() @@ -235,8 +278,22 @@ def test_hash_should_not_be_equal( device: Device, ) -> None: configure_test_with_device(device) - image_dataset1 = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset1_output)) if isinstance(image_dataset1_output, str) else image_dataset1_output) # type: ignore[type-var] - image_dataset2 = ImageDataset(ImageList.from_files(resolve_resource_path(image_dataset2_input)), ImageList.from_files(resolve_resource_path(image_dataset2_output)) if isinstance(image_dataset2_output, str) else image_dataset2_output) # type: ignore[type-var] + image_dataset1 = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset1_output)) + if isinstance(image_dataset1_output, str) + else image_dataset1_output + ), + ) # type: ignore[type-var] + image_dataset2 = ImageDataset( + ImageList.from_files(resolve_resource_path(image_dataset2_input)), + ( + ImageList.from_files(resolve_resource_path(image_dataset2_output)) + if isinstance(image_dataset2_output, str) + else image_dataset2_output + ), + ) # type: ignore[type-var] assert hash(image_dataset1) != hash(image_dataset2) assert image_dataset1._input._tensor.device == _get_device() assert image_dataset1._output._tensor.device == _get_device() @@ -246,7 +303,6 @@ def test_hash_should_not_be_equal( @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestSizeOf: - @pytest.mark.parametrize( "image_dataset_output", [ @@ -256,10 +312,19 @@ class TestSizeOf: ], ) def test_should_size_be_greater_than_normal_object( - self, image_dataset_output: str | Column | Table, device: Device, + self, + image_dataset_output: str | Column | Table, + device: Device, ) -> None: configure_test_with_device(device) - image_dataset = ImageDataset(ImageList.from_files(resolve_resource_path(plane_png_path)), ImageList.from_files(resolve_resource_path(image_dataset_output)) if isinstance(image_dataset_output, str) else image_dataset_output) # type: ignore[type-var] + image_dataset = ImageDataset( + ImageList.from_files(resolve_resource_path(plane_png_path)), + ( + ImageList.from_files(resolve_resource_path(image_dataset_output)) + if isinstance(image_dataset_output, str) + else image_dataset_output + ), + ) # type: ignore[type-var] assert sys.getsizeof(image_dataset) > sys.getsizeof(object()) assert image_dataset._input._tensor.device == _get_device() assert image_dataset._output._tensor.device == _get_device() @@ -267,7 +332,6 @@ def test_should_size_be_greater_than_normal_object( @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestShuffle: - def test_should_be_different_order(self, device: Device) -> None: configure_test_with_device(device) torch.manual_seed(1234) @@ -284,7 +348,6 @@ def test_should_be_different_order(self, device: Device) -> None: @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestBatch: - @pytest.mark.parametrize( ("batch_number", "batch_size"), [ @@ -320,7 +383,6 @@ def test_get_batch_device(self, device: Device) -> None: @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestTableAsTensor: - def test_should_raise_if_not_one_hot_encoded(self, device: Device) -> None: configure_test_with_device(device) with pytest.raises( @@ -350,7 +412,6 @@ def test_eq_should_be_not_implemented(self, device: Device) -> None: @pytest.mark.parametrize("device", get_devices(), ids=get_devices_ids()) class TestColumnAsTensor: - @pytest.mark.parametrize( ("tensor", "one_hot_encoder", "error", "error_msg"), [ diff --git a/tests/safeds/data/tabular/containers/_column/test_repr.py b/tests/safeds/data/tabular/containers/_column/test_repr.py index 0482372a2..7b3b6dee3 100644 --- a/tests/safeds/data/tabular/containers/_column/test_repr.py +++ b/tests/safeds/data/tabular/containers/_column/test_repr.py @@ -7,15 +7,15 @@ [ ( Column("a", []), - "+------+\n" "| a |\n" "| --- |\n" "| null |\n" "+======+\n" "+------+", + "+------+\n| a |\n| --- |\n| null |\n+======+\n+------+", ), ( Column("a", [0]), - "+-----+\n" "| a |\n" "| --- |\n" "| i64 |\n" "+=====+\n" "| 0 |\n" "+-----+", + "+-----+\n| a |\n| --- |\n| i64 |\n+=====+\n| 0 |\n+-----+", ), ( Column("a", [0, "1"]), - "+------+\n" "| a |\n" "| --- |\n" "| str |\n" "+======+\n" "| null |\n" "| 1 |\n" "+------+", + "+------+\n| a |\n| --- |\n| str |\n+======+\n| null |\n| 1 |\n+------+", ), ], ids=[ diff --git a/tests/safeds/data/tabular/containers/_column/test_str.py b/tests/safeds/data/tabular/containers/_column/test_str.py index 0482372a2..7b3b6dee3 100644 --- a/tests/safeds/data/tabular/containers/_column/test_str.py +++ b/tests/safeds/data/tabular/containers/_column/test_str.py @@ -7,15 +7,15 @@ [ ( Column("a", []), - "+------+\n" "| a |\n" "| --- |\n" "| null |\n" "+======+\n" "+------+", + "+------+\n| a |\n| --- |\n| null |\n+======+\n+------+", ), ( Column("a", [0]), - "+-----+\n" "| a |\n" "| --- |\n" "| i64 |\n" "+=====+\n" "| 0 |\n" "+-----+", + "+-----+\n| a |\n| --- |\n| i64 |\n+=====+\n| 0 |\n+-----+", ), ( Column("a", [0, "1"]), - "+------+\n" "| a |\n" "| --- |\n" "| str |\n" "+======+\n" "| null |\n" "| 1 |\n" "+------+", + "+------+\n| a |\n| --- |\n| str |\n+======+\n| null |\n| 1 |\n+------+", ), ], ids=[ diff --git a/tests/safeds/data/tabular/containers/_table/test_hash.py b/tests/safeds/data/tabular/containers/_table/test_hash.py index 1a5144053..e91543403 100644 --- a/tests/safeds/data/tabular/containers/_table/test_hash.py +++ b/tests/safeds/data/tabular/containers/_table/test_hash.py @@ -1,7 +1,5 @@ -from typing import Any - import pytest -from safeds.data.tabular.containers import Row, Table +from safeds.data.tabular.containers import Table @pytest.mark.parametrize( @@ -30,11 +28,7 @@ def test_should_return_same_hash_for_equal_tables(table1: Table, table2: Table) (Table({"col1": [1, 2, 3]}), Table({"col1": ["1", "2", "3"]})), (Table({"col1": [1, 2, 3]}), Table({"col1": [1, 2, 3, 4]})), ], - ids=[ - "different column names", - "different types", - "different number of rows" - ], + ids=["different column names", "different types", "different number of rows"], ) def test_should_return_different_hash_for_unequal_tables(table1: Table, table2: Table) -> None: assert hash(table1) != hash(table2) diff --git a/tests/safeds/data/tabular/containers/_table/test_number_of_columns.py b/tests/safeds/data/tabular/containers/_table/test_number_of_columns.py index 66be91460..347e18dd8 100644 --- a/tests/safeds/data/tabular/containers/_table/test_number_of_columns.py +++ b/tests/safeds/data/tabular/containers/_table/test_number_of_columns.py @@ -5,12 +5,11 @@ @pytest.mark.parametrize( ("table", "expected"), [ - (Table(), 0), (Table(), 0), (Table({"col1": []}), 1), (Table({"col1": [], "col2": []}), 2), ], - ids=["empty", "empty 2", "a column", "2 columns"], + ids=["empty", "a column", "2 columns"], ) def test_should_return_number_of_columns(table: Table, expected: int) -> None: assert table.number_of_columns == expected diff --git a/tests/safeds/data/tabular/containers/_table/test_number_of_rows.py b/tests/safeds/data/tabular/containers/_table/test_number_of_rows.py index db26ebc90..0c3e91214 100644 --- a/tests/safeds/data/tabular/containers/_table/test_number_of_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_number_of_rows.py @@ -5,12 +5,11 @@ @pytest.mark.parametrize( ("table", "expected"), [ - (Table(), 0), (Table(), 0), (Table({"col1": [1]}), 1), (Table({"col1": [1, 2]}), 2), ], - ids=["empty", "empty 2", "a row", "2 rows"], + ids=["empty", "a row", "2 rows"], ) def test_should_return_number_of_rows(table: Table, expected: int) -> None: assert table.number_of_rows == expected diff --git a/tests/safeds/data/tabular/containers/_table/test_plot_histograms.py b/tests/safeds/data/tabular/containers/_table/test_plot_histograms.py index 2ba38d7c1..2b50b775f 100644 --- a/tests/safeds/data/tabular/containers/_table/test_plot_histograms.py +++ b/tests/safeds/data/tabular/containers/_table/test_plot_histograms.py @@ -61,7 +61,7 @@ "g", "a", ], - } + }, ), ], ids=["one column", "four columns", "two columns with compressed visualization"], diff --git a/tests/safeds/data/tabular/containers/_table/test_remove_duplicate_rows.py b/tests/safeds/data/tabular/containers/_table/test_remove_duplicate_rows.py index 3a3470cd8..186563154 100644 --- a/tests/safeds/data/tabular/containers/_table/test_remove_duplicate_rows.py +++ b/tests/safeds/data/tabular/containers/_table/test_remove_duplicate_rows.py @@ -15,8 +15,9 @@ Table({"A": [1, 4], "B": [2, 5]}), ), (Table(), Table()), + (Table({"col1": []}), Table({"col1": []})), ], - ids=["duplicate rows", "empty"], + ids=["duplicate rows", "empty", "no rows"], ) def test_should_remove_duplicate_rows(table: Table, expected: Table) -> None: result_table = table.remove_duplicate_rows() diff --git a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py index 9ebf37167..70642e501 100644 --- a/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py +++ b/tests/safeds/data/tabular/containers/_table/test_remove_rows_with_outliers.py @@ -261,6 +261,4 @@ ) def test_should_remove_rows_with_outliers(table: Table, expected: Table) -> None: updated_table = table.remove_rows_with_outliers() - assert updated_table.schema == expected.schema - assert updated_table.number_of_rows == expected.number_of_rows assert updated_table == expected diff --git a/tests/safeds/data/tabular/containers/_table/test_transform_table.py b/tests/safeds/data/tabular/containers/_table/test_transform_table.py index 5335c9f3a..065ebf457 100644 --- a/tests/safeds/data/tabular/containers/_table/test_transform_table.py +++ b/tests/safeds/data/tabular/containers/_table/test_transform_table.py @@ -1,7 +1,7 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import OneHotEncoder -from safeds.exceptions import TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, TransformerNotFittedError @pytest.mark.parametrize( diff --git a/tests/safeds/data/tabular/transformation/test_discretizer.py b/tests/safeds/data/tabular/transformation/test_discretizer.py index 11d03718b..63b50a1a8 100644 --- a/tests/safeds/data/tabular/transformation/test_discretizer.py +++ b/tests/safeds/data/tabular/transformation/test_discretizer.py @@ -1,7 +1,7 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import Discretizer -from safeds.exceptions import NonNumericColumnError, OutOfBoundsError, TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, NonNumericColumnError, OutOfBoundsError, TransformerNotFittedError class TestInit: @@ -265,41 +265,3 @@ def test_should_not_change_original_table(self) -> None: ) assert table == expected - - def test_get_names_of_added_columns(self) -> None: - transformer = Discretizer() - with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): - transformer.get_names_of_added_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == [] - - def test_get_names_of_changed_columns(self) -> None: - transformer = Discretizer() - with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): - transformer.get_names_of_changed_columns() - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == ["a"] - - def test_get_names_of_removed_columns(self) -> None: - transformer = Discretizer() - with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == [] diff --git a/tests/safeds/data/tabular/transformation/test_label_encoder.py b/tests/safeds/data/tabular/transformation/test_label_encoder.py index 787c73b10..0388bc394 100644 --- a/tests/safeds/data/tabular/transformation/test_label_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_label_encoder.py @@ -1,7 +1,7 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import LabelEncoder -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, ColumnTypeError, TransformerNotFittedError class TestFit: @@ -39,8 +39,9 @@ def test_should_not_change_original_transformer(self) -> None: transformer = LabelEncoder() transformer.fit(table, None) - assert transformer._wrapped_transformer is None assert transformer._column_names is None + assert transformer._mapping is None + assert transformer._inverse_mapping is None class TestTransform: @@ -60,7 +61,7 @@ def test_should_raise_if_column_not_found(self) -> None: }, ) - with pytest.raises(ColumnNotFoundError, match=r"Could not find column\(s\) 'col1, col2'"): + with pytest.raises(ColumnNotFoundError): transformer.transform(table_to_transform) def test_should_raise_if_not_fitted(self) -> None: @@ -75,10 +76,6 @@ def test_should_raise_if_not_fitted(self) -> None: with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."): transformer.transform(table) - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises(ValueError, match=r"The LabelEncoder cannot transform the table because it contains 0 rows"): - LabelEncoder().fit(Table({"col1": ["one", "two"]}), ["col1"]).transform(Table({"col1": []})) - class TestIsFitted: def test_should_return_false_before_fitting(self) -> None: @@ -159,44 +156,6 @@ def test_should_not_change_original_table(self) -> None: assert table == expected - def test_get_names_of_added_columns(self) -> None: - transformer = LabelEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_added_columns() - - table = Table( - { - "a": ["b"], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == [] - - def test_get_names_of_changed_columns(self) -> None: - transformer = LabelEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_changed_columns() - table = Table( - { - "a": ["b"], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == ["a"] - - def test_get_names_of_removed_columns(self) -> None: - transformer = LabelEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": ["b"], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == [] - class TestInverseTransform: @pytest.mark.parametrize( @@ -254,18 +213,8 @@ def test_should_raise_if_column_not_found(self) -> None: ).inverse_transform(Table({"col3": [1.0, 0.0]})) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): LabelEncoder().fit( Table({"col1": ["one", "two"], "col2": ["three", "four"]}), ["col1", "col2"], ).inverse_transform(Table({"col1": ["1", "null"], "col2": ["2", "apple"]})) - - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises( - ValueError, - match=r"The LabelEncoder cannot inverse transform the table because it contains 0 rows", - ): - LabelEncoder().fit(Table({"col1": ["one", "two"]}), ["col1"]).inverse_transform(Table({"col1": []})) diff --git a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py index 0eec11bee..d95bd97d2 100644 --- a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py @@ -4,15 +4,14 @@ from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import OneHotEncoder from safeds.exceptions import ( + ColumnNotFoundError, NonNumericColumnError, TransformerNotFittedError, - ColumnNotFoundError, ValueNotPresentWhenFittedError, ) class TestEq: - def test_should_be_not_implemented(self) -> None: assert OneHotEncoder().__eq__(Table()) is NotImplemented @@ -311,45 +310,6 @@ def test_should_not_change_original_table(self) -> None: assert table == expected - def test_get_names_of_added_columns(self) -> None: - transformer = OneHotEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_added_columns() - - table = Table( - {"a__b": ["c", "d"], "a": ["b__c", "d"], "b": ["a", float("nan")]}, - ) - added_columns = ["a__b__c", "a__b__d", "a__b__c#2", "a__d", "b__a", "b__nan"] - - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == added_columns - - def test_get_names_of_changed_columns(self) -> None: - transformer = OneHotEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_changed_columns() - - table = Table( - { - "a": ["b"], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == [] - - def test_get_names_of_removed_columns(self) -> None: - transformer = OneHotEncoder() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": ["b"], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == ["a"] - class TestInverseTransform: @pytest.mark.parametrize( diff --git a/tests/safeds/data/tabular/transformation/test_range_scaler.py b/tests/safeds/data/tabular/transformation/test_range_scaler.py index 1229e12b3..2204e93fa 100644 --- a/tests/safeds/data/tabular/transformation/test_range_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_range_scaler.py @@ -1,12 +1,12 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import RangeScaler -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, ColumnTypeError, TransformerNotFittedError class TestInit: def test_should_raise_value_error(self) -> None: - with pytest.raises(ValueError, match='Parameter "maximum" must be higher than parameter "minimum".'): + with pytest.raises(ValueError, match='Parameter "max_" must be greater than parameter "min_".'): _ = RangeScaler(min_=10, max_=0) @@ -22,15 +22,12 @@ def test_should_raise_if_column_not_found(self) -> None: RangeScaler().fit(table, ["col2", "col3"]) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): RangeScaler().fit(Table({"col1": ["a", "b"], "col2": [1, "c"]}), ["col1", "col2"]) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The RangeScaler cannot be fitted because the table contains 0 rows"): - RangeScaler().fit(Table({"col1": []}), ["col1"]) + RangeScaler().fit(Table({"col1": []}), None) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -42,8 +39,9 @@ def test_should_not_change_original_transformer(self) -> None: transformer = RangeScaler() transformer.fit(table, None) - assert transformer._wrapped_transformer is None assert transformer._column_names is None + assert transformer._data_min is None + assert transformer._data_max is None class TestTransform: @@ -79,18 +77,11 @@ def test_should_raise_if_not_fitted(self) -> None: transformer.transform(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): RangeScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).transform( Table({"col1": ["a", "b", "c"], "col2": ["c", "d", "e"]}), ) - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises(ValueError, match=r"The RangeScaler cannot transform the table because it contains 0 rows"): - RangeScaler().fit(Table({"col1": [1, 2, 3]}), ["col1"]).transform(Table({"col1": []})) - class TestIsFitted: def test_should_return_false_before_fitting(self) -> None: @@ -218,44 +209,6 @@ def test_should_not_change_original_table(self) -> None: assert table == expected - def test_get_names_of_added_columns(self) -> None: - transformer = RangeScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_added_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == [] - - def test_get_names_of_changed_columns(self) -> None: - transformer = RangeScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_changed_columns() - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == ["a"] - - def test_get_names_of_removed_columns(self) -> None: - transformer = RangeScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == [] - class TestInverseTransform: @pytest.mark.parametrize( @@ -311,14 +264,7 @@ def test_should_raise_if_column_not_found(self) -> None: ) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): RangeScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( Table({"col1": ["1", "2", "three"], "col2": [1, 2, "four"]}), ) - - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises(ValueError, match=r"The RangeScaler cannot transform the table because it contains 0 rows"): - RangeScaler().fit(Table({"col1": [1, 2, 3]}), ["col1"]).inverse_transform(Table({"col1": []})) diff --git a/tests/safeds/data/tabular/transformation/test_simple_imputer.py b/tests/safeds/data/tabular/transformation/test_simple_imputer.py index 03c0b7e23..821018dfe 100644 --- a/tests/safeds/data/tabular/transformation/test_simple_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_simple_imputer.py @@ -5,7 +5,7 @@ from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import SimpleImputer from safeds.data.tabular.transformation._simple_imputer import _Mode -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, NonNumericColumnError, TransformerNotFittedError def strategies() -> list[SimpleImputer.Strategy]: @@ -21,25 +21,25 @@ def strategies() -> list[SimpleImputer.Strategy]: The list of classifiers to test. """ return [ - SimpleImputer.Strategy.Constant(2), - SimpleImputer.Strategy.Mean(), - SimpleImputer.Strategy.Median(), - SimpleImputer.Strategy.Mode(), + SimpleImputer.Strategy.constant(2), + SimpleImputer.Strategy.mean(), + SimpleImputer.Strategy.median(), + SimpleImputer.Strategy.mode(), ] class TestStrategyClass: def test_should_be_able_to_get_value_of_constant_strategy(self) -> None: - assert SimpleImputer.Strategy.Constant(1).value == 1 # type: ignore[attr-defined] + assert SimpleImputer.Strategy.constant(1).value == 1 # type: ignore[attr-defined] @pytest.mark.parametrize( ("strategy", "type_", "expected"), [ - (SimpleImputer.Strategy.Constant(0), SimpleImputer.Strategy.Constant, True), - (SimpleImputer.Strategy.Mean(), SimpleImputer.Strategy.Mean, True), - (SimpleImputer.Strategy.Median(), SimpleImputer.Strategy.Median, True), - (SimpleImputer.Strategy.Mode(), SimpleImputer.Strategy.Mode, True), - (SimpleImputer.Strategy.Mode(), SimpleImputer.Strategy.Mean, False), + (SimpleImputer.Strategy.constant(0), SimpleImputer.Strategy.constant, True), + (SimpleImputer.Strategy.mean(), SimpleImputer.Strategy.mean, True), + (SimpleImputer.Strategy.median(), SimpleImputer.Strategy.median, True), + (SimpleImputer.Strategy.mode(), SimpleImputer.Strategy.mode, True), + (SimpleImputer.Strategy.mode(), SimpleImputer.Strategy.mean, False), ], ) def test_should_be_able_to_use_strategy_in_isinstance( @@ -114,7 +114,7 @@ def test_should_return_different_hash_for_unequal_strategy( class TestSizeof: @pytest.mark.parametrize( "strategy", - ([SimpleImputer.Strategy.Constant(1)]), + ([SimpleImputer.Strategy.constant(1)]), ids=lambda x: x.__class__.__name__, ) def test_sizeof_strategy( @@ -127,15 +127,17 @@ class TestStr: @pytest.mark.parametrize( ("strategy", "expected"), [ - (SimpleImputer.Strategy.Constant(0), "Constant(0)"), - (SimpleImputer.Strategy.Mean(), "Mean"), - (SimpleImputer.Strategy.Median(), "Median"), - (SimpleImputer.Strategy.Mode(), "Mode"), + (SimpleImputer.Strategy.constant(0), "Constant(0)"), + (SimpleImputer.Strategy.mean(), "Mean"), + (SimpleImputer.Strategy.median(), "Median"), + (SimpleImputer.Strategy.mode(), "Mode"), ], ids=lambda x: x.__class__.__name__, ) def test_should_return_correct_string_representation( - self, strategy: SimpleImputer.Strategy, expected: str + self, + strategy: SimpleImputer.Strategy, + expected: str, ) -> None: assert str(strategy) == expected @@ -157,7 +159,7 @@ class TestValueToReplaceProperty: ) def test_should_return_correct_value_to_replace(self, value_to_replace: float | str | None) -> None: assert ( - SimpleImputer(SimpleImputer.Strategy.Mode(), value_to_replace=value_to_replace).value_to_replace + SimpleImputer(SimpleImputer.Strategy.mode(), value_to_replace=value_to_replace).value_to_replace == value_to_replace ) @@ -182,8 +184,8 @@ def test_should_raise_if_table_contains_no_rows(self, strategy: SimpleImputer.St @pytest.mark.parametrize( ("table", "col_names", "strategy"), [ - (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.Mean()), - (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.Median()), + (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.mean()), + (Table({"col1": [1, None, "ok"], "col2": [1, 2, "3"]}), ["col1", "col2"], SimpleImputer.Strategy.median()), ], ids=["Strategy Mean", "Strategy Median"], ) @@ -220,7 +222,7 @@ def test_should_warn_if_multiple_mode_values(self, table: Table, most_frequent: rf" values:\n{most_frequent}" ), ): - SimpleImputer(SimpleImputer.Strategy.Mode()).fit(table, None) + SimpleImputer(SimpleImputer.Strategy.mode()).fit(table, None) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) def test_should_not_change_original_transformer(self, strategy: SimpleImputer.Strategy) -> None: @@ -316,7 +318,7 @@ class TestFitAndTransform: }, ), None, - SimpleImputer.Strategy.Constant(0.0), + SimpleImputer.Strategy.constant(0.0), None, Table( { @@ -331,7 +333,7 @@ class TestFitAndTransform: }, ), None, - SimpleImputer.Strategy.Mean(), + SimpleImputer.Strategy.mean(), None, Table( { @@ -346,7 +348,7 @@ class TestFitAndTransform: }, ), None, - SimpleImputer.Strategy.Median(), + SimpleImputer.Strategy.median(), None, Table( { @@ -361,7 +363,7 @@ class TestFitAndTransform: }, ), None, - SimpleImputer.Strategy.Mode(), + SimpleImputer.Strategy.mode(), None, Table( { @@ -377,7 +379,7 @@ class TestFitAndTransform: }, ), ["a"], - SimpleImputer.Strategy.Constant(0.0), + SimpleImputer.Strategy.constant(0.0), None, Table( { @@ -393,7 +395,7 @@ class TestFitAndTransform: }, ), ["a"], - SimpleImputer.Strategy.Mode(), + SimpleImputer.Strategy.mode(), None, Table({"a": [1.0, 1.0, 2.0, 2.0, 1.0]}), ), @@ -404,7 +406,7 @@ class TestFitAndTransform: }, ), None, - SimpleImputer.Strategy.Constant(1.0), + SimpleImputer.Strategy.constant(1.0), 0.0, Table( { @@ -462,47 +464,3 @@ def test_should_not_change_original_table(self, strategy: SimpleImputer.Strategy ) assert table == expected - - @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_added_columns(self, strategy: SimpleImputer.Strategy) -> None: - transformer = SimpleImputer(strategy=strategy) - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_added_columns() - - table = Table( - { - "a": [1, None], - "b": [1, 1], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == [] - - @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_changed_columns(self, strategy: SimpleImputer.Strategy) -> None: - transformer = SimpleImputer(strategy=strategy) - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_changed_columns() - table = Table( - { - "a": [1, None], - "b": [1, 1], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == ["a", "b"] - - @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) - def test_get_names_of_removed_columns(self, strategy: SimpleImputer.Strategy) -> None: - transformer = SimpleImputer(strategy=strategy) - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": [1, None], - "b": [1, 1], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == [] diff --git a/tests/safeds/data/tabular/transformation/test_standard_scaler.py b/tests/safeds/data/tabular/transformation/test_standard_scaler.py index 7d745f46e..911e82bca 100644 --- a/tests/safeds/data/tabular/transformation/test_standard_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_standard_scaler.py @@ -1,9 +1,9 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import StandardScaler -from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, ColumnNotFoundError +from safeds.exceptions import ColumnNotFoundError, ColumnTypeError, TransformerNotFittedError -from tests.helpers import assert_that_tables_are_close +from tests.helpers import assert_tables_equal class TestFit: @@ -18,10 +18,7 @@ def test_should_raise_if_column_not_found(self) -> None: StandardScaler().fit(table, ["col2", "col3"]) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): StandardScaler().fit( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), ["col1", "col2"], @@ -29,7 +26,7 @@ def test_should_raise_if_table_contains_non_numerical_data(self) -> None: def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The StandardScaler cannot be fitted because the table contains 0 rows"): - StandardScaler().fit(Table({"col1": []}), ["col1"]) + StandardScaler().fit(Table({"col1": []}), None) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -41,8 +38,9 @@ def test_should_not_change_original_transformer(self) -> None: transformer = StandardScaler() transformer.fit(table, None) - assert transformer._wrapped_transformer is None assert transformer._column_names is None + assert transformer._data_mean is None + assert transformer._data_standard_deviation is None class TestTransform: @@ -78,21 +76,11 @@ def test_should_raise_if_not_fitted(self) -> None: transformer.transform(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): StandardScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).transform( Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), ) - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises( - ValueError, - match=r"The StandardScaler cannot transform the table because it contains 0 rows", - ): - StandardScaler().fit(Table({"col1": [1, 2, 3]}), ["col1"]).transform(Table({"col1": []})) - class TestIsFitted: def test_should_return_false_before_fitting(self) -> None: @@ -141,7 +129,7 @@ def test_should_return_fitted_transformer_and_transformed_table( ) -> None: fitted_transformer, transformed_table = StandardScaler().fit_and_transform(table, column_names) assert fitted_transformer.is_fitted - assert_that_tables_are_close(transformed_table, expected) + assert_tables_equal(transformed_table, expected) def test_should_not_change_original_table(self) -> None: table = Table( @@ -160,44 +148,6 @@ def test_should_not_change_original_table(self) -> None: assert table == expected - def test_get_names_of_added_columns(self) -> None: - transformer = StandardScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_added_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_added_columns() == [] - - def test_get_names_of_changed_columns(self) -> None: - transformer = StandardScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_changed_columns() - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_changed_columns() == ["a"] - - def test_get_names_of_removed_columns(self) -> None: - transformer = StandardScaler() - with pytest.raises(TransformerNotFittedError): - transformer.get_names_of_removed_columns() - - table = Table( - { - "a": [0.0], - }, - ) - transformer = transformer.fit(table, None) - assert transformer.get_names_of_removed_columns() == [] - class TestInverseTransform: @pytest.mark.parametrize( @@ -233,7 +183,7 @@ def test_should_not_change_transformed_table(self) -> None: }, ) - assert_that_tables_are_close(transformed_table, expected) + assert_tables_equal(transformed_table, expected) def test_should_raise_if_not_fitted(self) -> None: table = Table( @@ -254,17 +204,7 @@ def test_should_raise_if_column_not_found(self) -> None: ) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: - with pytest.raises( - NonNumericColumnError, - match=r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\['col1', 'col2'\]", - ): + with pytest.raises(ColumnTypeError): StandardScaler().fit(Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), ) - - def test_should_raise_if_table_contains_no_rows(self) -> None: - with pytest.raises( - ValueError, - match=r"The StandardScaler cannot transform the table because it contains 0 rows", - ): - StandardScaler().fit(Table({"col1": [1, 2, 4]}), ["col1"]).inverse_transform(Table({"col1": []})) diff --git a/tests/safeds/data/tabular/transformation/test_table_transformer.py b/tests/safeds/data/tabular/transformation/test_table_transformer.py index fb4640119..ff80701a2 100644 --- a/tests/safeds/data/tabular/transformation/test_table_transformer.py +++ b/tests/safeds/data/tabular/transformation/test_table_transformer.py @@ -66,7 +66,7 @@ def transformers() -> list[TableTransformer]: transformers_numeric() + transformers_non_numeric() + [ - SimpleImputer(strategy=SimpleImputer.Strategy.Mode()), + SimpleImputer(strategy=SimpleImputer.Strategy.mode()), ] ) @@ -175,6 +175,6 @@ def test_should_return_different_hash_for_imputer_fit( transformer2: TableTransformer, valid_data_imputer: Table, ) -> None: - transformer1 = SimpleImputer(strategy=SimpleImputer.Strategy.Mode()) + transformer1 = SimpleImputer(strategy=SimpleImputer.Strategy.mode()) transformer1_fit = transformer1.fit(valid_data_imputer, ["col1"]) assert hash(transformer2) != hash(transformer1_fit) diff --git a/tests/safeds/ml/nn/converters/test_input_converter_image.py b/tests/safeds/ml/nn/converters/test_input_converter_image.py index 15eba9a50..46822d0d0 100644 --- a/tests/safeds/ml/nn/converters/test_input_converter_image.py +++ b/tests/safeds/ml/nn/converters/test_input_converter_image.py @@ -13,7 +13,6 @@ class TestIsFitDataValid: - @pytest.mark.parametrize( ("image_dataset_valid", "image_dataset_invalid"), [ @@ -78,7 +77,6 @@ def test_should_return_false_if_fit_data_is_invalid( class TestEq: - @pytest.mark.parametrize( ("input_conversion_image1", "input_conversion_image2"), [(InputConversionImage(ImageSize(1, 2, 3)), InputConversionImage(ImageSize(1, 2, 3)))], @@ -114,7 +112,6 @@ def test_should_be_not_implemented(self) -> None: class TestHash: - @pytest.mark.parametrize( ("input_conversion_image1", "input_conversion_image2"), [(InputConversionImage(ImageSize(1, 2, 3)), InputConversionImage(ImageSize(1, 2, 3)))], @@ -145,7 +142,6 @@ def test_hash_should_not_be_equal( class TestSizeOf: - @pytest.mark.parametrize("input_conversion_image", [InputConversionImage(ImageSize(1, 2, 3))]) def test_should_size_be_greater_than_normal_object(self, input_conversion_image: InputConversionImage) -> None: assert sys.getsizeof(input_conversion_image) > sys.getsizeof(object()) diff --git a/tests/safeds/ml/nn/converters/test_output_converter_image.py b/tests/safeds/ml/nn/converters/test_output_converter_image.py index be751a1ca..584eae4cc 100644 --- a/tests/safeds/ml/nn/converters/test_output_converter_image.py +++ b/tests/safeds/ml/nn/converters/test_output_converter_image.py @@ -15,7 +15,6 @@ class TestDataConversionImage: - @pytest.mark.parametrize( ("output_conversion", "kwargs"), [ @@ -33,7 +32,6 @@ def test_should_raise_if_input_data_is_multi_size( output_conversion._data_conversion(input_data=_MultiSizeImageList(), output_data=torch.empty(1), **kwargs) class TestEq: - @pytest.mark.parametrize( ("output_conversion_image1", "output_conversion_image2"), [ @@ -65,7 +63,6 @@ def test_should_be_not_implemented(self) -> None: assert output_conversion_image_to_column.__eq__(output_conversion_image_to_image) is NotImplemented class TestHash: - @pytest.mark.parametrize( ("output_conversion_image1", "output_conversion_image2"), [ @@ -90,7 +87,6 @@ def test_hash_should_not_be_equal(self) -> None: assert hash(output_conversion_image_to_table) != hash(output_conversion_image_to_column) class TestSizeOf: - @pytest.mark.parametrize( "output_conversion_image", [ @@ -107,7 +103,6 @@ def test_should_size_be_greater_than_normal_object( class TestOutputConversionImageToColumn: - def test_should_raise_if_column_name_not_set(self) -> None: with pytest.raises( ValueError, @@ -132,7 +127,6 @@ def test_should_raise_if_one_hot_encoder_not_set(self) -> None: class TestOutputConversionImageToTable: - def test_should_raise_if_column_names_not_set(self) -> None: with pytest.raises( ValueError, diff --git a/tests/safeds/ml/nn/converters/test_output_converter_time_series.py b/tests/safeds/ml/nn/converters/test_output_converter_time_series.py index f85266d40..131c5c369 100644 --- a/tests/safeds/ml/nn/converters/test_output_converter_time_series.py +++ b/tests/safeds/ml/nn/converters/test_output_converter_time_series.py @@ -38,7 +38,6 @@ def test_output_conversion_time_series_2() -> None: class TestEq: - @pytest.mark.parametrize( ("output_conversion_ts1", "output_conversion_ts2"), [ @@ -70,7 +69,6 @@ def test_should_not_be_equal( class TestHash: - @pytest.mark.parametrize( ("output_conversion_ts1", "output_conversion_ts2"), [ @@ -96,7 +94,6 @@ def test_hash_should_not_be_equal(self) -> None: class TestSizeOf: - @pytest.mark.parametrize( "output_conversion_ts", [ diff --git a/tests/safeds/ml/nn/layers/test_convolutional2d_layer.py b/tests/safeds/ml/nn/layers/test_convolutional2d_layer.py index fdc234a52..4b8ac362e 100644 --- a/tests/safeds/ml/nn/layers/test_convolutional2d_layer.py +++ b/tests/safeds/ml/nn/layers/test_convolutional2d_layer.py @@ -8,7 +8,6 @@ class TestConvolutional2DLayer: - @pytest.mark.parametrize( ("activation_function", "activation_layer"), [("sigmoid", nn.Sigmoid), ("relu", nn.ReLU), ("softmax", nn.Softmax)], @@ -159,7 +158,6 @@ def test_should_raise_if_input_size_is_set_with_int( layer._set_input_size(1) class TestEq: - @pytest.mark.parametrize( ("conv2dlayer1", "conv2dlayer2"), [ @@ -214,7 +212,6 @@ def test_should_be_not_implemented(self) -> None: assert convtranspose2dlayer.__eq__(conv2dlayer) is NotImplemented class TestHash: - @pytest.mark.parametrize( ("conv2dlayer1", "conv2dlayer2"), [ @@ -265,7 +262,6 @@ def test_hash_should_not_be_equal( assert hash(conv2dlayer1) != hash(conv2dlayer2) class TestSizeOf: - @pytest.mark.parametrize( "conv2dlayer", [ diff --git a/tests/safeds/ml/nn/layers/test_flatten_layer.py b/tests/safeds/ml/nn/layers/test_flatten_layer.py index 9e998ddc1..b034a7620 100644 --- a/tests/safeds/ml/nn/layers/test_flatten_layer.py +++ b/tests/safeds/ml/nn/layers/test_flatten_layer.py @@ -8,7 +8,6 @@ class TestFlattenLayer: - def test_should_create_flatten_layer(self) -> None: layer = FlattenLayer() input_size = ImageSize(10, 20, 30, _ignore_invalid_channel=True) @@ -33,7 +32,6 @@ def test_should_raise_if_input_size_is_set_with_int(self) -> None: layer._set_input_size(1) class TestEq: - def test_should_be_equal(self) -> None: assert FlattenLayer() == FlattenLayer() @@ -41,11 +39,9 @@ def test_should_be_not_implemented(self) -> None: assert FlattenLayer().__eq__(Table()) is NotImplemented class TestHash: - def test_hash_should_be_equal(self) -> None: assert hash(FlattenLayer()) == hash(FlattenLayer()) class TestSizeOf: - def test_should_size_be_greater_than_normal_object(self) -> None: assert sys.getsizeof(FlattenLayer()) > sys.getsizeof(object()) diff --git a/tests/safeds/ml/nn/layers/test_lstm_layer.py b/tests/safeds/ml/nn/layers/test_lstm_layer.py index 386cfd9db..dd23d3191 100644 --- a/tests/safeds/ml/nn/layers/test_lstm_layer.py +++ b/tests/safeds/ml/nn/layers/test_lstm_layer.py @@ -29,7 +29,6 @@ def test_should_raise_if_input_size_out_of_bounds(input_size: int) -> None: ids=["one", "twenty"], ) def test_should_raise_if_input_size_doesnt_match(input_size: int) -> None: - assert LSTMLayer(output_size=1, input_size=input_size).input_size == input_size diff --git a/tests/safeds/ml/nn/layers/test_pooling2d_layer.py b/tests/safeds/ml/nn/layers/test_pooling2d_layer.py index e5dc243f5..4c5fcb6e3 100644 --- a/tests/safeds/ml/nn/layers/test_pooling2d_layer.py +++ b/tests/safeds/ml/nn/layers/test_pooling2d_layer.py @@ -10,7 +10,6 @@ class TestPooling2DLayer: - @pytest.mark.parametrize( ("strategy", "torch_layer"), [ @@ -58,7 +57,6 @@ def test_should_raise_if_input_size_is_set_with_int(self, strategy: Literal["max layer._set_input_size(1) class TestEq: - @pytest.mark.parametrize( ("pooling_2d_layer_1", "pooling_2d_layer_2"), [ @@ -114,7 +112,6 @@ def test_should_be_not_implemented(self) -> None: assert avg_pooling_2d_layer.__eq__(max_pooling_2d_layer) is NotImplemented class TestHash: - @pytest.mark.parametrize( ("pooling_2d_layer_1", "pooling_2d_layer_2"), [ @@ -161,7 +158,6 @@ def test_hash_should_not_be_equal( assert hash(pooling_2d_layer_1) != hash(pooling_2d_layer_2) class TestSizeOf: - @pytest.mark.parametrize( "pooling_2d_layer", [ diff --git a/tests/safeds/ml/nn/test_cnn_workflow.py b/tests/safeds/ml/nn/test_cnn_workflow.py index c4e581a3e..f927cd88b 100644 --- a/tests/safeds/ml/nn/test_cnn_workflow.py +++ b/tests/safeds/ml/nn/test_cnn_workflow.py @@ -36,7 +36,6 @@ class TestImageToTableClassifier: - @pytest.mark.parametrize( ("seed", "device", "prediction_label"), [ @@ -104,7 +103,6 @@ def test_should_train_and_predict_model( class TestImageToColumnClassifier: - @pytest.mark.parametrize( ("seed", "device", "prediction_label"), [ @@ -171,7 +169,6 @@ def test_should_train_and_predict_model( class TestImageToImageRegressor: - @pytest.mark.parametrize( ("seed", "device"), [