Skip to content

Commit

Permalink
feat: specify partial order in label encoder (#763)
Browse files Browse the repository at this point in the history
Closes #639 

### Summary of Changes

* Optionally specify a partial order of labels in the label encoder
* Performance: Implement RangeScaler, StandardScaler, LabelEncoder with
polars

---------

Co-authored-by: megalinter-bot <[email protected]>
  • Loading branch information
lars-reimann and megalinter-bot authored May 14, 2024
1 parent 74cc701 commit 6fbe537
Show file tree
Hide file tree
Showing 60 changed files with 672 additions and 1,242 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ omit = [
]

[tool.pytest.ini_options]
addopts = "--snapshot-warn-unused"
addopts = "--snapshot-warn-unused --tb=short"
filterwarnings = [
"ignore:Deprecated call to `pkg_resources.declare_namespace",
"ignore:Jupyter is migrating its paths to use standard platformdirs"
Expand Down
61 changes: 61 additions & 0 deletions src/safeds/_validation/_check_columns_are_numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from safeds.exceptions import ColumnTypeError

if TYPE_CHECKING:
from collections.abc import Container

from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import Schema


def _check_columns_are_numeric(
    table_or_schema: Table | Schema,
    column_names: str | list[str],
    *,
    operation: str = "do a numeric operation",
) -> None:
    """
    Check if the columns with the specified names are numeric and raise an error if they are not.

    Missing columns are ignored. Use `_check_columns_exist` to check for missing columns.

    Parameters
    ----------
    table_or_schema:
        The table or schema to check.
    column_names:
        The column names to check.
    operation:
        The operation that is performed on the columns. This is used in the error message.

    Raises
    ------
    ColumnTypeError
        If a column exists but is not numeric.
    """
    from safeds.data.tabular.containers import Table  # circular import

    # Normalize both inputs: work on a schema and on a list of names.
    if isinstance(table_or_schema, Table):
        table_or_schema = table_or_schema.schema
    if isinstance(column_names, str):
        column_names = [column_names]

    # Only pay for set construction when several names must be checked;
    # a single lookup in the plain list is cheaper than building a set.
    known_names: Container
    if len(column_names) > 1:
        known_names = set(table_or_schema.column_names)
    else:
        known_names = table_or_schema.column_names

    non_numeric_names = []
    for name in column_names:
        # Missing columns are deliberately skipped (see docstring).
        if name in known_names and not table_or_schema.get_column_type(name).is_numeric:
            non_numeric_names.append(name)

    if non_numeric_names:
        raise ColumnTypeError(_build_error_message(non_numeric_names, operation))


def _build_error_message(non_numeric_names: list[str], operation: str) -> str:
return f"Tried to {operation} on non-numeric columns {non_numeric_names}."
6 changes: 4 additions & 2 deletions src/safeds/data/image/containers/_image.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import io
import os.path
import sys
import warnings
from pathlib import Path
Expand Down Expand Up @@ -79,9 +78,12 @@ def from_file(path: str | Path) -> Image:
"""
from torchvision.io import read_image

if isinstance(path, str):
path = Path(path)

_init_default_device()

if not os.path.isfile(path):
if not path.is_file():
raise FileNotFoundError(f"No such file or directory: '{path}'")

return Image(image_tensor=read_image(str(path)).to(_get_device()))
Expand Down
2 changes: 0 additions & 2 deletions src/safeds/data/image/containers/_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ def from_files(
return image_list

class _FromFileThreadPackage:

def __init__(
self,
im_files: list[str],
Expand Down Expand Up @@ -323,7 +322,6 @@ def __len__(self) -> int:
return len(self._im_files)

class _FromImageThread(Thread):

def __init__(self, packages: list[ImageList._FromFileThreadPackage]) -> None:
super().__init__()
self._packages = packages
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/image/containers/_multi_size_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
single_size_image_list._indices_to_tensor_positions.keys(),
[image_size] * len(single_size_image_list),
strict=False,
)
),
)
if max_channel is None:
max_channel = single_size_image_list.channel
Expand All @@ -80,7 +80,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
for size in image_list._image_list_dict:
if max_channel is not None and image_list._image_list_dict[size].channel != max_channel:
image_list._image_list_dict[size] = image_list._image_list_dict[size].change_channel(
int(max_channel)
int(max_channel),
)
return image_list

Expand Down
9 changes: 7 additions & 2 deletions src/safeds/data/image/containers/_single_size_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING

from safeds._config import _init_default_device, _get_device
from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
from safeds.data.image._utils._image_transformation_error_and_warning_checks import (
_check_add_noise_errors,
Expand Down Expand Up @@ -82,7 +82,12 @@ def _create_image_list_from_files(
image_list = _SingleSizeImageList()

images_tensor = torch.empty(
number_of_images, max_channel, height, width, dtype=torch.uint8, device=_get_device()
number_of_images,
max_channel,
height,
width,
dtype=torch.uint8,
device=_get_device(),
)

thread_packages: list[ImageList._FromFileThreadPackage] = []
Expand Down
12 changes: 6 additions & 6 deletions src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, s
_output_size: int | ImageSize = output_data.number_of_columns
elif isinstance(output_data, Column):
_column_as_tensor = _ColumnAsTensor(output_data)
_output_size = len(_column_as_tensor._one_hot_encoder.get_names_of_added_columns())
_output_size = len(_column_as_tensor._one_hot_encoder._get_names_of_added_columns())
_output = _column_as_tensor
elif isinstance(output_data, _SingleSizeImageList):
_output = output_data._clone()._as_single_size_image_list()
Expand Down Expand Up @@ -289,7 +289,6 @@ def shuffle(self) -> ImageDataset[T]:


class _TableAsTensor:

def __init__(self, table: Table) -> None:
import torch

Expand Down Expand Up @@ -345,7 +344,6 @@ def _to_table(self) -> Table:


class _ColumnAsTensor:

def __init__(self, column: Column) -> None:
import torch

Expand All @@ -359,6 +357,8 @@ def __init__(self, column: Column) -> None:
message=rf"The columns \['{self._column_name}'\] contain numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values",
category=UserWarning,
)
# TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not
# be done automatically?
self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name])
self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch()).to(
_get_device(),
Expand Down Expand Up @@ -394,9 +394,9 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode
raise ValueError(f"Tensor has an invalid amount of dimensions. Needed 2 dimensions but got {tensor.dim()}.")
if not one_hot_encoder.is_fitted:
raise TransformerNotFittedError
if tensor.size(dim=1) != len(one_hot_encoder.get_names_of_added_columns()):
if tensor.size(dim=1) != len(one_hot_encoder._get_names_of_added_columns()):
raise ValueError(
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder.get_names_of_added_columns())}).",
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder._get_names_of_added_columns())}).",
)
table_as_tensor = _ColumnAsTensor.__new__(_ColumnAsTensor)
table_as_tensor._tensor = tensor
Expand All @@ -406,6 +406,6 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode

def _to_column(self) -> Column:
table = Table(
dict(zip(self._one_hot_encoder.get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
dict(zip(self._one_hot_encoder._get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
)
return self._one_hot_encoder.inverse_transform(table).get_column(self._column_name)
12 changes: 10 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:

# Implementation
self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data)
self.__data_frame_cache: pl.DataFrame | None = None
self.__data_frame_cache: pl.DataFrame | None = None # Scramble the name to prevent access from outside

def __eq__(self, other: object) -> bool:
if not isinstance(other, Table):
Expand Down Expand Up @@ -1033,6 +1033,9 @@ def remove_duplicate_rows(self) -> Table:
| 2 | 5 |
+-----+-----+
"""
if self.number_of_columns == 0:
return self # Workaround for https://github.com/pola-rs/polars/issues/16207

return Table._from_polars_lazy_frame(
self._lazy_frame.unique(maintain_order=True),
)
Expand Down Expand Up @@ -1221,6 +1224,8 @@ def remove_rows_with_outliers(
| null | 8 |
+------+-----+
"""
if self.number_of_rows == 0:
return self # polars raises a ComputeError for tables without rows
if column_names is None:
column_names = self.column_names

Expand Down Expand Up @@ -1440,7 +1445,10 @@ def split_rows(
The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table
contains the remaining rows.
**Note:** The original table is not modified.
**Notes:**
- The original table is not modified.
- By default, the rows are shuffled before splitting. You can disable this by setting `shuffle` to False.
Parameters
----------
Expand Down
87 changes: 26 additions & 61 deletions src/safeds/data/tabular/transformation/_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound
from safeds.data.tabular.containers import Table
from safeds.exceptions import (
Expand Down Expand Up @@ -30,13 +31,36 @@ class Discretizer(TableTransformer):
If the given number_of_bins is less than 2.
"""

def __init__(self, number_of_bins: int = 5):
# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, number_of_bins: int = 5) -> None:
TableTransformer.__init__(self)

_check_bounds("number_of_bins", number_of_bins, lower_bound=_ClosedBound(2))

self._column_names: list[str] | None = None
self._wrapped_transformer: sk_KBinsDiscretizer | None = None
self._number_of_bins = number_of_bins

def __hash__(self) -> int:
return _structural_hash(
TableTransformer.__hash__(self),
self._number_of_bins,
)

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
def number_of_bins(self) -> int:
return self._number_of_bins

# ------------------------------------------------------------------------------------------------------------------
# Learning and transformation
# ------------------------------------------------------------------------------------------------------------------

def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
"""
Learn a transformation for a set of columns in a table.
Expand Down Expand Up @@ -137,62 +161,3 @@ def transform(self, table: Table) -> Table:
return Table._from_polars_lazy_frame(
table._lazy_frame.update(new_data.lazy()),
)

@property
def is_fitted(self) -> bool:
"""Whether the transformer is fitted."""
return self._wrapped_transformer is not None

def get_names_of_added_columns(self) -> list[str]:
"""
Get the names of all new columns that have been added by the Discretizer.
Returns
-------
added_columns:
A list of names of the added columns, ordered as they will appear in the table.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []

def get_names_of_changed_columns(self) -> list[str]:
"""
Get the names of all columns that may have been changed by the Discretizer.
Returns
-------
changed_columns:
The list of (potentially) changed column names, as passed to fit.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if self._column_names is None:
raise TransformerNotFittedError
return self._column_names

def get_names_of_removed_columns(self) -> list[str]:
"""
Get the names of all columns that have been removed by the Discretizer.
Returns
-------
removed_columns:
A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on.
Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []
Loading

0 comments on commit 6fbe537

Please sign in to comment.