Skip to content

Commit

Permalink
Merge branch 'main' into 710-easily-create-a-baseline-model
Browse files Browse the repository at this point in the history
  • Loading branch information
sibre28 authored Jun 25, 2024
2 parents e7579a0 + 3878751 commit d5150c6
Show file tree
Hide file tree
Showing 17 changed files with 1,331 additions and 267 deletions.
774 changes: 585 additions & 189 deletions docs/tutorials/data_processing.ipynb

Large diffs are not rendered by default.

72 changes: 38 additions & 34 deletions docs/tutorials/data_visualization.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/safeds/data/image/containers/_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def __contains__(self, item: object) -> bool:
Returns
-------
has_item:
Weather the given item is in this image list
Whether the given item is in this image list
"""
return isinstance(item, Image) and self.has_image(item)

Expand Down Expand Up @@ -524,7 +524,7 @@ def has_image(self, image: Image) -> bool:
Returns
-------
has_image:
Weather the given image is in this image list
Whether the given image is in this image list
"""

# ------------------------------------------------------------------------------------------------------------------
Expand Down
126 changes: 99 additions & 27 deletions src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ImageDataset(Dataset[ImageList, Out_co]):
batch_size:
the batch size used for training
shuffle:
weather the data should be shuffled after each epoch of training
whether the data should be shuffled after each epoch of training
"""

def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
Expand Down Expand Up @@ -108,13 +108,13 @@ def __iter__(self) -> ImageDataset:
return im_ds

def __next__(self) -> tuple[Tensor, Tensor]:
if self._next_batch_index * self._batch_size >= len(self._input):
if self._next_batch_index * self._batch_size >= len(self._shuffle_tensor_indices):
raise StopIteration
self._next_batch_index += 1
return self._get_batch(self._next_batch_index - 1)

def __len__(self) -> int:
return self._input.image_count
return len(self._shuffle_tensor_indices)

def __eq__(self, other: object) -> bool:
"""
Expand All @@ -138,6 +138,7 @@ def __eq__(self, other: object) -> bool:
and isinstance(other._output, type(self._output))
and (self._input == other._input)
and (self._output == other._output)
and (self._shuffle_tensor_indices.tolist() == other._shuffle_tensor_indices.tolist())
)

def __hash__(self) -> int:
Expand All @@ -149,7 +150,13 @@ def __hash__(self) -> int:
hash:
the hash value
"""
return _structural_hash(self._input, self._output, self._shuffle_after_epoch, self._batch_size)
return _structural_hash(
self._input,
self._output,
self._shuffle_after_epoch,
self._batch_size,
self._shuffle_tensor_indices.tolist(),
)

def __sizeof__(self) -> int:
"""
Expand Down Expand Up @@ -205,7 +212,7 @@ def get_input(self) -> ImageList:
input:
the input data of this dataset
"""
return self._sort_image_list_with_shuffle_tensor_indices(self._input)
return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._input)

def get_output(self) -> Out_co:
"""
Expand All @@ -222,19 +229,25 @@ def get_output(self) -> Out_co:
elif isinstance(output, _ColumnAsTensor):
return output._to_column(self._shuffle_tensor_indices) # type: ignore[return-value]
else:
return self._sort_image_list_with_shuffle_tensor_indices(self._output) # type: ignore[return-value]
return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._output) # type: ignore[return-value]

def _sort_image_list_with_shuffle_tensor_indices(self, image_list: _SingleSizeImageList) -> _SingleSizeImageList:
def _sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(
self,
image_list: _SingleSizeImageList,
) -> _SingleSizeImageList:
shuffled_image_list = _SingleSizeImageList()
shuffled_image_list._tensor = image_list._tensor
shuffled_image_list._indices_to_tensor_positions = {
index: self._shuffle_tensor_indices[tensor_position].item()
for index, tensor_position in image_list._indices_to_tensor_positions.items()
tensor_pos = [
image_list._indices_to_tensor_positions[shuffled_index]
for shuffled_index in sorted(self._shuffle_tensor_indices.tolist())
]
temp_pos = {
shuffled_index: new_index for new_index, shuffled_index in enumerate(self._shuffle_tensor_indices.tolist())
}
shuffled_image_list._tensor = image_list._tensor[tensor_pos]
shuffled_image_list._tensor_positions_to_indices = [
index
for index, _ in sorted(shuffled_image_list._indices_to_tensor_positions.items(), key=lambda item: item[1])
new_index for _, new_index in sorted(temp_pos.items(), key=lambda item: item[0])
]
shuffled_image_list._indices_to_tensor_positions = shuffled_image_list._calc_new_indices_to_tensor_positions()
return shuffled_image_list

def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[Tensor, Tensor]:
Expand All @@ -247,18 +260,18 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[

_check_bounds("batch_size", batch_size, lower_bound=_ClosedBound(1))

if batch_number < 0 or batch_size * batch_number >= len(self._input):
if batch_number < 0 or batch_size * batch_number >= len(self._shuffle_tensor_indices):
raise IndexOutOfBoundsError(batch_size * batch_number)
max_index = (
batch_size * (batch_number + 1) if batch_size * (batch_number + 1) < len(self._input) else len(self._input)
batch_size * (batch_number + 1)
if batch_size * (batch_number + 1) < len(self._shuffle_tensor_indices)
else len(self._shuffle_tensor_indices)
)
input_tensor = (
self._input._tensor[
self._shuffle_tensor_indices[
[
self._input._indices_to_tensor_positions[index]
for index in range(batch_size * batch_number, max_index)
]
[
self._input._indices_to_tensor_positions[index]
for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
]
].to(torch.float32)
/ 255
Expand All @@ -267,11 +280,9 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
if isinstance(self._output, _SingleSizeImageList):
output_tensor = (
self._output._tensor[
self._shuffle_tensor_indices[
[
self._output._indices_to_tensor_positions[index]
for index in range(batch_size * batch_number, max_index)
]
[
self._input._indices_to_tensor_positions[index]
for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
]
].to(torch.float32)
/ 255
Expand All @@ -284,7 +295,7 @@ def shuffle(self) -> ImageDataset[Out_co]:
"""
Return a new `ImageDataset` with shuffled data.
The original dataset list is not modified.
The original dataset is not modified.
Returns
-------
Expand All @@ -296,10 +307,71 @@ def shuffle(self) -> ImageDataset[Out_co]:
_init_default_device()

im_dataset: ImageDataset[Out_co] = copy.copy(self)
im_dataset._shuffle_tensor_indices = torch.randperm(len(self))
im_dataset._shuffle_tensor_indices = self._shuffle_tensor_indices[
torch.randperm(len(self._shuffle_tensor_indices))
]
im_dataset._next_batch_index = 0
return im_dataset

def split(
    self,
    percentage_in_first: float,
    *,
    shuffle: bool = True,
) -> tuple[ImageDataset[Out_co], ImageDataset[Out_co]]:
    """
    Create two image datasets by splitting the data of the current dataset.

    The first dataset contains a percentage of the data specified by `percentage_in_first`, and the second dataset
    contains the remaining data. The number of samples in the first dataset is
    `round(percentage_in_first * len(self))`.

    The original dataset is not modified.

    By default, the data is shuffled before splitting. You can disable this by setting `shuffle` to False. In that
    case the current order of this dataset is kept.

    Parameters
    ----------
    percentage_in_first:
        The percentage of data to include in the first dataset. Must be between 0 and 1.
    shuffle:
        Whether to shuffle the data before splitting.

    Returns
    -------
    first_dataset:
        The first dataset.
    second_dataset:
        The second dataset.

    Raises
    ------
    OutOfBoundsError
        If `percentage_in_first` is not between 0 and 1.
    """
    import torch

    _check_bounds(
        "percentage_in_first",
        percentage_in_first,
        lower_bound=_ClosedBound(0),
        upper_bound=_ClosedBound(1),
    )

    # `_shuffle_tensor_indices` defines which samples belong to this dataset and in which order. The split must
    # select from these indices; fresh positions 0..n-1 would point at the wrong samples whenever this dataset is
    # itself the result of an earlier split or shuffle.
    current_indices = self._shuffle_tensor_indices
    sample_count = len(current_indices)

    if shuffle:
        # Same approach as `shuffle()`: permute positions, then map them through the current indices.
        permutation = torch.randperm(sample_count, device=current_indices.device)
        ordered_indices = current_indices[permutation]
    else:
        # Clone so the views handed to the new datasets do not share storage with this dataset's indices.
        ordered_indices = current_indices.clone()

    size_of_first = round(percentage_in_first * sample_count)

    first_dataset: ImageDataset[Out_co] = copy.copy(self)
    second_dataset: ImageDataset[Out_co] = copy.copy(self)
    first_dataset._shuffle_tensor_indices, second_dataset._shuffle_tensor_indices = ordered_indices.split(
        [size_of_first, sample_count - size_of_first],
    )

    # The shallow copies inherit the iteration state of this dataset; start both at the first batch.
    first_dataset._next_batch_index = 0
    second_dataset._next_batch_index = 0
    return first_dataset, second_dataset


class _TableAsTensor:
def __init__(self, table: Table) -> None:
Expand Down
92 changes: 85 additions & 7 deletions src/safeds/data/tabular/plotting/_table_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def correlation_heatmap(self) -> Image:
# TODO: implement using matplotlib and polars
# https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

only_numerical = self._table.remove_non_numeric_columns()._data_frame.fill_null(0)

Expand All @@ -115,15 +115,18 @@ def correlation_heatmap(self) -> Image:
" automatically expanding."
),
)
fig = plt.figure()
sns.heatmap(
data=only_numerical.corr().to_numpy(),

fig, ax = plt.subplots()
heatmap = plt.imshow(
only_numerical.corr().to_numpy(),
vmin=-1,
vmax=1,
xticklabels=only_numerical.columns,
yticklabels=only_numerical.columns,
cmap="vlag",
cmap="coolwarm",
)
ax.set_xticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
ax.set_yticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
fig.colorbar(heatmap)

plt.tight_layout()

return _figure_to_image(fig)
Expand Down Expand Up @@ -353,6 +356,81 @@ def scatter_plot(self, x_name: str, y_names: list[str]) -> Image:

return _figure_to_image(fig)

def moving_average_plot(self, x_name: str, y_name: str, window_size: int) -> Image:
    """
    Create a moving average plot for the y column and plot it by the x column in the table.

    Rows that share the same x value are first averaged into a single point. The moving average is then computed
    over the points ordered by their x value. The first `window_size - 1` points have no complete window and are
    not plotted.

    Parameters
    ----------
    x_name:
        The name of the column to be plotted on the x-axis.
    y_name:
        The name of the column to be plotted on the y-axis.
    window_size:
        The number of consecutive points that are averaged. Must be at least 1.

    Returns
    -------
    plot:
        The plot as an image.

    Raises
    ------
    ColumnNotFoundError
        If a column does not exist.
    TypeError
        If a column is not numeric.
    ValueError
        If one of the columns contains missing values or if `window_size` is less than 1.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Table
    >>> table = Table(
    ...     {
    ...         "a": [1, 2, 3, 4, 5],
    ...         "b": [2, 3, 4, 5, 6],
    ...     }
    ... )
    >>> image = table.plot.moving_average_plot("a", "b", window_size = 2)
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import polars as pl

    _plot_validation(self._table, x_name, [y_name])
    for name in (x_name, y_name):
        if self._table.get_column(name).missing_value_count() >= 1:
            raise ValueError(
                f"there are missing values in column '{name}', use transformation to fill missing values "
                f"or drop the missing values. For a moving average no missing values are allowed.",
            )
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, but got {window_size}.")

    # Collapse duplicate x values into their mean. `group_by` does not guarantee the order of its groups, so the
    # result is sorted afterwards; the rolling window below depends on the rows being ordered by x.
    averaged = (
        self._table._lazy_frame.group_by(x_name)
        .agg(pl.col(y_name).mean())
        .sort(x_name)
        .collect()
    )

    # Calculate the moving average
    moving_average = averaged.select(pl.col(y_name).rolling_mean(window_size).alias("moving_average"))

    # Set up the arrays for plotting. Incomplete leading windows show up as NaN and are dropped together with
    # their x values.
    y_data_with_nan = moving_average["moving_average"].to_numpy()
    is_valid = ~np.isnan(y_data_with_nan)
    y_data = y_data_with_nan[is_valid]
    x_data = averaged[x_name].to_numpy()[is_valid]

    fig, ax = plt.subplots()
    ax.plot(x_data, y_data, label="moving average")
    ax.set(
        xlabel=x_name,
        ylabel=y_name,
    )
    ax.legend()
    if self._table.get_column(x_name).is_temporal:
        ax.set_xticks(x_data)  # Set x-ticks to the x data points
    # Pin the tick positions before assigning labels to them.
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment="right",
    )  # rotate the labels of the x Axis to prevent the chance of overlapping of the labels
    fig.tight_layout()

    return _figure_to_image(fig)


def _plot_validation(table: Table, x_name: str, y_names: list[str]) -> None:
y_names.append(x_name)
Expand Down
8 changes: 1 addition & 7 deletions src/safeds/data/tabular/transformation/_range_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,7 @@ class RangeScaler(InvertibleTableTransformer):
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(
self,
min_: float = 0.0,
max_: float = 1.0,
*,
column_names: str | list[str] | None = None,
) -> None:
def __init__(self, *, column_names: str | list[str] | None = None, min_: float = 0.0, max_: float = 1.0) -> None:
super().__init__(column_names)

if min_ >= max_:
Expand Down
3 changes: 3 additions & 0 deletions src/safeds/ml/nn/layers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._convolutional2d_layer import Convolutional2DLayer, ConvolutionalTranspose2DLayer
from ._flatten_layer import FlattenLayer
from ._forward_layer import ForwardLayer
from ._gru_layer import GRULayer
from ._layer import Layer
from ._lstm_layer import LSTMLayer
from ._pooling2d_layer import AveragePooling2DLayer, MaxPooling2DLayer
Expand All @@ -21,6 +22,7 @@
"ForwardLayer": "._forward_layer:ForwardLayer",
"Layer": "._layer:Layer",
"LSTMLayer": "._lstm_layer:LSTMLayer",
"GRULayer": "._gru_layer:GRULayer",
"AveragePooling2DLayer": "._pooling2d_layer:AveragePooling2DLayer",
"MaxPooling2DLayer": "._pooling2d_layer:MaxPooling2DLayer",
},
Expand All @@ -33,6 +35,7 @@
"ForwardLayer",
"Layer",
"LSTMLayer",
"GRULayer",
"AveragePooling2DLayer",
"MaxPooling2DLayer",
]
Loading

0 comments on commit d5150c6

Please sign in to comment.