Skip to content

Commit

Permalink
feat: specify partial order in label encoder (#763)
Browse files Browse the repository at this point in the history
Closes #639 

### Summary of Changes

* Optionally specify a partial order of labels in the label encoder
* Performance: Implement RangeScaler, StandardScaler, LabelEncoder with
polars

---------

Co-authored-by: megalinter-bot <[email protected]>
  • Loading branch information
lars-reimann and megalinter-bot authored May 14, 2024
1 parent 74cc701 commit 6fbe537
Show file tree
Hide file tree
Showing 60 changed files with 672 additions and 1,242 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ omit = [
]

[tool.pytest.ini_options]
addopts = "--snapshot-warn-unused"
addopts = "--snapshot-warn-unused --tb=short"
filterwarnings = [
"ignore:Deprecated call to `pkg_resources.declare_namespace",
"ignore:Jupyter is migrating its paths to use standard platformdirs"
Expand Down
61 changes: 61 additions & 0 deletions src/safeds/_validation/_check_columns_are_numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from safeds.exceptions import ColumnTypeError

if TYPE_CHECKING:
from collections.abc import Container

from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import Schema


def _check_columns_are_numeric(
    table_or_schema: Table | Schema,
    column_names: str | list[str],
    *,
    operation: str = "do a numeric operation",
) -> None:
    """
    Check if the columns with the specified names are numeric and raise an error if they are not.

    Missing columns are ignored. Use `_check_columns_exist` to check for missing columns.

    Parameters
    ----------
    table_or_schema:
        The table or schema to check.
    column_names:
        The column names to check.
    operation:
        The operation that is performed on the columns. This is used in the error message.

    Raises
    ------
    ColumnTypeError
        If a column exists but is not numeric.
    """
    from safeds.data.tabular.containers import Table  # circular import

    # Normalize both inputs: work on a schema and on a list of names.
    if isinstance(table_or_schema, Table):
        table_or_schema = table_or_schema.schema
    if isinstance(column_names, str):
        column_names = [column_names]

    # Only pay for set construction when several names must be checked;
    # a single lookup in the plain list is cheaper than building a set.
    known_names: Container
    if len(column_names) > 1:
        known_names = set(table_or_schema.column_names)
    else:
        known_names = table_or_schema.column_names

    non_numeric_names = []
    for name in column_names:
        # Missing columns are deliberately skipped (see docstring).
        if name in known_names and not table_or_schema.get_column_type(name).is_numeric:
            non_numeric_names.append(name)

    if non_numeric_names:
        raise ColumnTypeError(_build_error_message(non_numeric_names, operation))


def _build_error_message(non_numeric_names: list[str], operation: str) -> str:
return f"Tried to {operation} on non-numeric columns {non_numeric_names}."
6 changes: 4 additions & 2 deletions src/safeds/data/image/containers/_image.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import io
import os.path
import sys
import warnings
from pathlib import Path
Expand Down Expand Up @@ -79,9 +78,12 @@ def from_file(path: str | Path) -> Image:
"""
from torchvision.io import read_image

if isinstance(path, str):
path = Path(path)

_init_default_device()

if not os.path.isfile(path):
if not path.is_file():
raise FileNotFoundError(f"No such file or directory: '{path}'")

return Image(image_tensor=read_image(str(path)).to(_get_device()))
Expand Down
2 changes: 0 additions & 2 deletions src/safeds/data/image/containers/_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ def from_files(
return image_list

class _FromFileThreadPackage:

def __init__(
self,
im_files: list[str],
Expand Down Expand Up @@ -323,7 +322,6 @@ def __len__(self) -> int:
return len(self._im_files)

class _FromImageThread(Thread):

def __init__(self, packages: list[ImageList._FromFileThreadPackage]) -> None:
super().__init__()
self._packages = packages
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/image/containers/_multi_size_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
single_size_image_list._indices_to_tensor_positions.keys(),
[image_size] * len(single_size_image_list),
strict=False,
)
),
)
if max_channel is None:
max_channel = single_size_image_list.channel
Expand All @@ -80,7 +80,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
for size in image_list._image_list_dict:
if max_channel is not None and image_list._image_list_dict[size].channel != max_channel:
image_list._image_list_dict[size] = image_list._image_list_dict[size].change_channel(
int(max_channel)
int(max_channel),
)
return image_list

Expand Down
9 changes: 7 additions & 2 deletions src/safeds/data/image/containers/_single_size_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING

from safeds._config import _init_default_device, _get_device
from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
from safeds.data.image._utils._image_transformation_error_and_warning_checks import (
_check_add_noise_errors,
Expand Down Expand Up @@ -82,7 +82,12 @@ def _create_image_list_from_files(
image_list = _SingleSizeImageList()

images_tensor = torch.empty(
number_of_images, max_channel, height, width, dtype=torch.uint8, device=_get_device()
number_of_images,
max_channel,
height,
width,
dtype=torch.uint8,
device=_get_device(),
)

thread_packages: list[ImageList._FromFileThreadPackage] = []
Expand Down
12 changes: 6 additions & 6 deletions src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, s
_output_size: int | ImageSize = output_data.number_of_columns
elif isinstance(output_data, Column):
_column_as_tensor = _ColumnAsTensor(output_data)
_output_size = len(_column_as_tensor._one_hot_encoder.get_names_of_added_columns())
_output_size = len(_column_as_tensor._one_hot_encoder._get_names_of_added_columns())
_output = _column_as_tensor
elif isinstance(output_data, _SingleSizeImageList):
_output = output_data._clone()._as_single_size_image_list()
Expand Down Expand Up @@ -289,7 +289,6 @@ def shuffle(self) -> ImageDataset[T]:


class _TableAsTensor:

def __init__(self, table: Table) -> None:
import torch

Expand Down Expand Up @@ -345,7 +344,6 @@ def _to_table(self) -> Table:


class _ColumnAsTensor:

def __init__(self, column: Column) -> None:
import torch

Expand All @@ -359,6 +357,8 @@ def __init__(self, column: Column) -> None:
message=rf"The columns \['{self._column_name}'\] contain numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values",
category=UserWarning,
)
# TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not
# be done automatically?
self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name])
self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch()).to(
_get_device(),
Expand Down Expand Up @@ -394,9 +394,9 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode
raise ValueError(f"Tensor has an invalid amount of dimensions. Needed 2 dimensions but got {tensor.dim()}.")
if not one_hot_encoder.is_fitted:
raise TransformerNotFittedError
if tensor.size(dim=1) != len(one_hot_encoder.get_names_of_added_columns()):
if tensor.size(dim=1) != len(one_hot_encoder._get_names_of_added_columns()):
raise ValueError(
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder.get_names_of_added_columns())}).",
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder._get_names_of_added_columns())}).",
)
table_as_tensor = _ColumnAsTensor.__new__(_ColumnAsTensor)
table_as_tensor._tensor = tensor
Expand All @@ -406,6 +406,6 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode

def _to_column(self) -> Column:
table = Table(
dict(zip(self._one_hot_encoder.get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
dict(zip(self._one_hot_encoder._get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
)
return self._one_hot_encoder.inverse_transform(table).get_column(self._column_name)
12 changes: 10 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:

# Implementation
self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data)
self.__data_frame_cache: pl.DataFrame | None = None
self.__data_frame_cache: pl.DataFrame | None = None # Scramble the name to prevent access from outside

def __eq__(self, other: object) -> bool:
if not isinstance(other, Table):
Expand Down Expand Up @@ -1033,6 +1033,9 @@ def remove_duplicate_rows(self) -> Table:
| 2 | 5 |
+-----+-----+
"""
if self.number_of_columns == 0:
return self # Workaround for https://github.com/pola-rs/polars/issues/16207

return Table._from_polars_lazy_frame(
self._lazy_frame.unique(maintain_order=True),
)
Expand Down Expand Up @@ -1221,6 +1224,8 @@ def remove_rows_with_outliers(
| null | 8 |
+------+-----+
"""
if self.number_of_rows == 0:
return self # polars raises a ComputeError for tables without rows
if column_names is None:
column_names = self.column_names

Expand Down Expand Up @@ -1440,7 +1445,10 @@ def split_rows(
The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table
contains the remaining rows.
**Note:** The original table is not modified.
**Notes:**
- The original table is not modified.
- By default, the rows are shuffled before splitting. You can disable this by setting `shuffle` to False.
Parameters
----------
Expand Down
87 changes: 26 additions & 61 deletions src/safeds/data/tabular/transformation/_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound
from safeds.data.tabular.containers import Table
from safeds.exceptions import (
Expand Down Expand Up @@ -30,13 +31,36 @@ class Discretizer(TableTransformer):
If the given number_of_bins is less than 2.
"""

def __init__(self, number_of_bins: int = 5):
# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, number_of_bins: int = 5) -> None:
TableTransformer.__init__(self)

_check_bounds("number_of_bins", number_of_bins, lower_bound=_ClosedBound(2))

self._column_names: list[str] | None = None
self._wrapped_transformer: sk_KBinsDiscretizer | None = None
self._number_of_bins = number_of_bins

def __hash__(self) -> int:
return _structural_hash(
TableTransformer.__hash__(self),
self._number_of_bins,
)

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
def number_of_bins(self) -> int:
return self._number_of_bins

# ------------------------------------------------------------------------------------------------------------------
# Learning and transformation
# ------------------------------------------------------------------------------------------------------------------

def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
"""
Learn a transformation for a set of columns in a table.
Expand Down Expand Up @@ -137,62 +161,3 @@ def transform(self, table: Table) -> Table:
return Table._from_polars_lazy_frame(
table._lazy_frame.update(new_data.lazy()),
)

@property
def is_fitted(self) -> bool:
"""Whether the transformer is fitted."""
return self._wrapped_transformer is not None

def get_names_of_added_columns(self) -> list[str]:
"""
Get the names of all new columns that have been added by the Discretizer.
Returns
-------
added_columns:
A list of names of the added columns, ordered as they will appear in the table.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []

def get_names_of_changed_columns(self) -> list[str]:
"""
Get the names of all columns that may have been changed by the Discretizer.
Returns
-------
changed_columns:
The list of (potentially) changed column names, as passed to fit.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if self._column_names is None:
raise TransformerNotFittedError
return self._column_names

def get_names_of_removed_columns(self) -> list[str]:
"""
Get the names of all columns that have been removed by the Discretizer.
Returns
-------
removed_columns:
A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []
Loading

0 comments on commit 6fbe537

Please sign in to comment.