Merge branch 'main' into 710-easily-create-a-baseline-model

Safe-DS · Jun 25, 2024 · d5150c6 · d5150c6
2 parents e7579a0 + 3878751
commit d5150c6
Show file tree

Hide file tree

Showing 17 changed files with 1,331 additions and 267 deletions.
diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb
diff --git a/docs/tutorials/data_visualization.ipynb b/docs/tutorials/data_visualization.ipynb
diff --git a/src/safeds/data/image/containers/_image_list.py b/src/safeds/data/image/containers/_image_list.py
@@ -404,7 +404,7 @@ def __contains__(self, item: object) -> bool:
         Returns
         -------
         has_item:
-            Weather the given item is in this image list
+            Whether the given item is in this image list
         """
         return isinstance(item, Image) and self.has_image(item)
 
@@ -524,7 +524,7 @@ def has_image(self, image: Image) -> bool:
         Returns
         -------
         has_image:
-            Weather the given image is in this image list
+            Whether the given image is in this image list
         """
 
     # ------------------------------------------------------------------------------------------------------------------

diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py
@@ -43,7 +43,7 @@ class ImageDataset(Dataset[ImageList, Out_co]):
     batch_size:
         the batch size used for training
     shuffle:
-        weather the data should be shuffled after each epoch of training
+        whether the data should be shuffled after each epoch of training
     """
 
     def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
@@ -108,13 +108,13 @@ def __iter__(self) -> ImageDataset:
         return im_ds
 
     def __next__(self) -> tuple[Tensor, Tensor]:
-        if self._next_batch_index * self._batch_size >= len(self._input):
+        if self._next_batch_index * self._batch_size >= len(self._shuffle_tensor_indices):
             raise StopIteration
         self._next_batch_index += 1
         return self._get_batch(self._next_batch_index - 1)
 
     def __len__(self) -> int:
-        return self._input.image_count
+        return len(self._shuffle_tensor_indices)
 
     def __eq__(self, other: object) -> bool:
         """
@@ -138,6 +138,7 @@ def __eq__(self, other: object) -> bool:
             and isinstance(other._output, type(self._output))
             and (self._input == other._input)
             and (self._output == other._output)
+            and (self._shuffle_tensor_indices.tolist() == other._shuffle_tensor_indices.tolist())
         )
 
     def __hash__(self) -> int:
@@ -149,7 +150,13 @@ def __hash__(self) -> int:
         hash:
             the hash value
         """
-        return _structural_hash(self._input, self._output, self._shuffle_after_epoch, self._batch_size)
+        return _structural_hash(
+            self._input,
+            self._output,
+            self._shuffle_after_epoch,
+            self._batch_size,
+            self._shuffle_tensor_indices.tolist(),
+        )
 
     def __sizeof__(self) -> int:
         """
@@ -205,7 +212,7 @@ def get_input(self) -> ImageList:
         input:
             the input data of this dataset
         """
-        return self._sort_image_list_with_shuffle_tensor_indices(self._input)
+        return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._input)
 
     def get_output(self) -> Out_co:
         """
@@ -222,19 +229,25 @@ def get_output(self) -> Out_co:
         elif isinstance(output, _ColumnAsTensor):
             return output._to_column(self._shuffle_tensor_indices)  # type: ignore[return-value]
         else:
-            return self._sort_image_list_with_shuffle_tensor_indices(self._output)  # type: ignore[return-value]
+            return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._output)  # type: ignore[return-value]
 
-    def _sort_image_list_with_shuffle_tensor_indices(self, image_list: _SingleSizeImageList) -> _SingleSizeImageList:
+    def _sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(
+        self,
+        image_list: _SingleSizeImageList,
+    ) -> _SingleSizeImageList:
         shuffled_image_list = _SingleSizeImageList()
-        shuffled_image_list._tensor = image_list._tensor
-        shuffled_image_list._indices_to_tensor_positions = {
-            index: self._shuffle_tensor_indices[tensor_position].item()
-            for index, tensor_position in image_list._indices_to_tensor_positions.items()
+        tensor_pos = [
+            image_list._indices_to_tensor_positions[shuffled_index]
+            for shuffled_index in sorted(self._shuffle_tensor_indices.tolist())
+        ]
+        temp_pos = {
+            shuffled_index: new_index for new_index, shuffled_index in enumerate(self._shuffle_tensor_indices.tolist())
         }
+        shuffled_image_list._tensor = image_list._tensor[tensor_pos]
         shuffled_image_list._tensor_positions_to_indices = [
-            index
-            for index, _ in sorted(shuffled_image_list._indices_to_tensor_positions.items(), key=lambda item: item[1])
+            new_index for _, new_index in sorted(temp_pos.items(), key=lambda item: item[0])
         ]
+        shuffled_image_list._indices_to_tensor_positions = shuffled_image_list._calc_new_indices_to_tensor_positions()
         return shuffled_image_list
 
     def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[Tensor, Tensor]:
@@ -247,18 +260,18 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
 
         _check_bounds("batch_size", batch_size, lower_bound=_ClosedBound(1))
 
-        if batch_number < 0 or batch_size * batch_number >= len(self._input):
+        if batch_number < 0 or batch_size * batch_number >= len(self._shuffle_tensor_indices):
             raise IndexOutOfBoundsError(batch_size * batch_number)
         max_index = (
-            batch_size * (batch_number + 1) if batch_size * (batch_number + 1) < len(self._input) else len(self._input)
+            batch_size * (batch_number + 1)
+            if batch_size * (batch_number + 1) < len(self._shuffle_tensor_indices)
+            else len(self._shuffle_tensor_indices)
         )
         input_tensor = (
             self._input._tensor[
-                self._shuffle_tensor_indices[
-                    [
-                        self._input._indices_to_tensor_positions[index]
-                        for index in range(batch_size * batch_number, max_index)
-                    ]
+                [
+                    self._input._indices_to_tensor_positions[index]
+                    for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
                 ]
             ].to(torch.float32)
             / 255
@@ -267,11 +280,9 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
         if isinstance(self._output, _SingleSizeImageList):
             output_tensor = (
                 self._output._tensor[
-                    self._shuffle_tensor_indices[
-                        [
-                            self._output._indices_to_tensor_positions[index]
-                            for index in range(batch_size * batch_number, max_index)
-                        ]
+                    [
+                        self._input._indices_to_tensor_positions[index]
+                        for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
                     ]
                 ].to(torch.float32)
                 / 255
@@ -284,7 +295,7 @@ def shuffle(self) -> ImageDataset[Out_co]:
         """
         Return a new `ImageDataset` with shuffled data.
 
-        The original dataset list is not modified.
+        The original dataset is not modified.
 
         Returns
         -------
@@ -296,10 +307,71 @@ def shuffle(self) -> ImageDataset[Out_co]:
         _init_default_device()
 
         im_dataset: ImageDataset[Out_co] = copy.copy(self)
-        im_dataset._shuffle_tensor_indices = torch.randperm(len(self))
+        im_dataset._shuffle_tensor_indices = self._shuffle_tensor_indices[
+            torch.randperm(len(self._shuffle_tensor_indices))
+        ]
         im_dataset._next_batch_index = 0
         return im_dataset
 
+    def split(
+        self,
+        percentage_in_first: float,
+        *,
+        shuffle: bool = True,
+    ) -> tuple[ImageDataset[Out_co], ImageDataset[Out_co]]:
+        """
+        Create two image datasets by splitting the data of the current dataset.
+
+        The first dataset contains a percentage of the data specified by `percentage_in_first`, and the second dataset
+        contains the remaining data. Only the data currently in this dataset is distributed, so splitting the result
+        of an earlier split works as expected.
+
+        The original dataset is not modified.
+        By default, the data is shuffled before splitting. You can disable this by setting `shuffle` to False.
+
+        Parameters
+        ----------
+        percentage_in_first:
+            The percentage of data to include in the first dataset. Must be between 0 and 1.
+        shuffle:
+            Whether to shuffle the data before splitting. If False, the current order of the data is kept.
+
+        Returns
+        -------
+        first_dataset:
+            The first dataset.
+        second_dataset:
+            The second dataset.
+
+        Raises
+        ------
+        OutOfBoundsError
+            If `percentage_in_first` is not between 0 and 1.
+        """
+        import torch
+
+        _check_bounds(
+            "percentage_in_first",
+            percentage_in_first,
+            lower_bound=_ClosedBound(0),
+            upper_bound=_ClosedBound(1),
+        )
+
+        # Reorder the indices this dataset already holds instead of creating fresh ones from 0 to len - 1. Otherwise,
+        # a dataset that is itself the result of a split would hand images to its parts that it does not contain.
+        indices = self._shuffle_tensor_indices
+        indices = indices[torch.randperm(len(indices))] if shuffle else indices.clone()
+        size_of_first = round(percentage_in_first * len(indices))
+
+        first_dataset: ImageDataset[Out_co] = copy.copy(self)
+        second_dataset: ImageDataset[Out_co] = copy.copy(self)
+        first_dataset._shuffle_tensor_indices, second_dataset._shuffle_tensor_indices = indices.split(
+            [size_of_first, len(indices) - size_of_first],
+        )
+        # The parts are new datasets, so iteration starts at their first batch (same as in `shuffle`).
+        first_dataset._next_batch_index = second_dataset._next_batch_index = 0
+        return first_dataset, second_dataset
+
 
 class _TableAsTensor:
     def __init__(self, table: Table) -> None:

diff --git a/src/safeds/data/tabular/plotting/_table_plotter.py b/src/safeds/data/tabular/plotting/_table_plotter.py
@@ -97,7 +97,7 @@ def correlation_heatmap(self) -> Image:
         # TODO: implement using matplotlib and polars
         #  https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap
         import matplotlib.pyplot as plt
-        import seaborn as sns
+        import numpy as np
 
         only_numerical = self._table.remove_non_numeric_columns()._data_frame.fill_null(0)
 
@@ -115,15 +115,18 @@ def correlation_heatmap(self) -> Image:
                     " automatically expanding."
                 ),
             )
-            fig = plt.figure()
-            sns.heatmap(
-                data=only_numerical.corr().to_numpy(),
+
+            fig, ax = plt.subplots()
+            heatmap = plt.imshow(
+                only_numerical.corr().to_numpy(),
                 vmin=-1,
                 vmax=1,
-                xticklabels=only_numerical.columns,
-                yticklabels=only_numerical.columns,
-                cmap="vlag",
+                cmap="coolwarm",
             )
+            ax.set_xticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
+            ax.set_yticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
+            fig.colorbar(heatmap)
+
             plt.tight_layout()
 
         return _figure_to_image(fig)
@@ -353,6 +356,81 @@ def scatter_plot(self, x_name: str, y_names: list[str]) -> Image:
 
         return _figure_to_image(fig)
 
+    def moving_average_plot(self, x_name: str, y_name: str, window_size: int) -> Image:
+        """
+        Create a moving average plot for the y column and plot it by the x column in the table.
+
+        Rows that share an x-value are averaged into one point before the moving average is computed. Points for which
+        the moving average is undefined are not plotted.
+
+        Parameters
+        ----------
+        x_name:
+            The name of the column to be plotted on the x-axis.
+        y_name:
+            The name of the column to be plotted on the y-axis.
+        window_size:
+            The number of consecutive points, ordered by x-value, that are averaged.
+
+        Returns
+        -------
+        plot:
+            The plot as an image.
+
+        Raises
+        ------
+        ColumnNotFoundError
+            If a column does not exist.
+        TypeError
+            If a column is not numeric.
+        ValueError
+            If one of the columns contains missing values.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Table
+        >>> table = Table(
+        ...     {
+        ...         "a": [1, 2, 3, 4, 5],
+        ...         "b": [2, 3, 4, 5, 6],
+        ...     }
+        ... )
+        >>> image = table.plot.moving_average_plot("a", "b", window_size=2)
+        """
+        import matplotlib.pyplot as plt
+        import numpy as np
+        import polars as pl
+
+        _plot_validation(self._table, x_name, [y_name])
+        for name in [x_name, y_name]:
+            if self._table.get_column(name).missing_value_count() >= 1:
+                raise ValueError(
+                    f"there are missing values in column '{name}', use transformation to fill missing values "
+                    f"or drop the missing values. For a moving average no missing values are allowed.",
+                )
+
+        # Average rows with equal x-values, then sort: `group_by` does not keep the row order, so sorting comes last
+        mean_col = pl.col(y_name).mean().alias(y_name)
+        data = self._table._lazy_frame.group_by(x_name).agg(mean_col).sort(x_name).collect()
+        moving_average = data.select(pl.col(y_name).rolling_mean(window_size))[y_name].to_numpy()
+        # Positions without a rolling mean show up as NaN after the conversion; keep only the defined points
+        is_defined = ~np.isnan(moving_average)
+        x_data = data[x_name].to_numpy()[is_defined]
+        y_data = moving_average[is_defined]
+
+        fig, ax = plt.subplots()
+        ax.plot(x_data, y_data, label="moving average")
+        ax.set(xlabel=x_name, ylabel=y_name)
+        ax.legend()
+        if self._table.get_column(x_name).is_temporal:
+            ax.set_xticks(x_data)  # Set x-ticks to the x data points
+        ax.set_xticks(ax.get_xticks())  # fix the ticks at their current positions before setting the labels
+        # rotate the labels of the x Axis to prevent the chance of overlapping of the labels
+        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
+        fig.tight_layout()
+
+        return _figure_to_image(fig)
+
 
 def _plot_validation(table: Table, x_name: str, y_names: list[str]) -> None:
     y_names.append(x_name)

diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py
@@ -37,13 +37,7 @@ class RangeScaler(InvertibleTableTransformer):
     # Dunder methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def __init__(
-        self,
-        min_: float = 0.0,
-        max_: float = 1.0,
-        *,
-        column_names: str | list[str] | None = None,
-    ) -> None:
+    def __init__(self, *, column_names: str | list[str] | None = None, min_: float = 0.0, max_: float = 1.0) -> None:
         super().__init__(column_names)
 
         if min_ >= max_:

diff --git a/src/safeds/ml/nn/layers/__init__.py b/src/safeds/ml/nn/layers/__init__.py
@@ -8,6 +8,7 @@
     from ._convolutional2d_layer import Convolutional2DLayer, ConvolutionalTranspose2DLayer
     from ._flatten_layer import FlattenLayer
     from ._forward_layer import ForwardLayer
+    from ._gru_layer import GRULayer
     from ._layer import Layer
     from ._lstm_layer import LSTMLayer
     from ._pooling2d_layer import AveragePooling2DLayer, MaxPooling2DLayer
@@ -21,6 +22,7 @@
         "ForwardLayer": "._forward_layer:ForwardLayer",
         "Layer": "._layer:Layer",
         "LSTMLayer": "._lstm_layer:LSTMLayer",
+        "GRULayer": "._gru_layer:GRULayer",
         "AveragePooling2DLayer": "._pooling2d_layer:AveragePooling2DLayer",
         "MaxPooling2DLayer": "._pooling2d_layer:MaxPooling2DLayer",
     },
@@ -33,6 +35,7 @@
     "ForwardLayer",
     "Layer",
     "LSTMLayer",
+    "GRULayer",
     "AveragePooling2DLayer",
     "MaxPooling2DLayer",
 ]