Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve transformers for tabular data #108

Merged
merged 11 commits on Mar 28, 2023
2 changes: 1 addition & 1 deletion src/safeds/data/tabular/transformation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ._imputer import Imputer
from ._imputer import Imputer, ImputerStrategy
from ._label_encoder import LabelEncoder
from ._one_hot_encoder import OneHotEncoder
131 changes: 75 additions & 56 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@

import pandas as pd
from safeds.data.tabular.containers import Table
from sklearn.impute import SimpleImputer
from safeds.data.tabular.transformation._table_transformer import TableTransformer
from safeds.exceptions import NotFittedError, UnknownColumnNameError
from sklearn.impute import SimpleImputer as sk_SimpleImputer


class ImputerStrategy(ABC):
@abstractmethod
def _augment_imputer(self, imputer: SimpleImputer) -> None:
def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
pass


# noinspection PyProtectedMember
class Imputer:
class Imputer(TableTransformer):
"""
Impute the data for a given Table.

Expand All @@ -39,7 +40,10 @@ class Constant(ImputerStrategy):
def __init__(self, value: Any):
self._value = value

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return f"Constant({self._value})"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "constant"
imputer.fill_value = self._value

Expand All @@ -48,97 +52,112 @@ class Mean(ImputerStrategy):
An imputation strategy for imputing missing data with mean values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Mean"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "mean"

class Median(ImputerStrategy):
"""
An imputation strategy for imputing missing data with median values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Median"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "median"

class Mode(ImputerStrategy):
"""
An imputation strategy for imputing missing data with mode values.
"""

def _augment_imputer(self, imputer: SimpleImputer) -> None:
def __str__(self) -> str:
return "Mode"

def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "most_frequent"

def __init__(self, strategy: ImputerStrategy):
self._imp = SimpleImputer()
strategy._augment_imputer(self._imp)
self._column_names: list[str] = []
self._strategy = strategy

def fit(self, table: Table, column_names: Optional[list[str]] = None) -> None:
self._wrapped_transformer: Optional[sk_SimpleImputer] = None
self._column_names: Optional[list[str]] = None

# noinspection PyProtectedMember
def fit(self, table: Table, column_names: Optional[list[str]] = None) -> Imputer:
"""
Fit the imputer on the dataset.
Learn a transformation for a set of columns in a table.

Parameters
----------
table : Table
The table used to learn the imputation values.
The table used to fit the transformer.
column_names : Optional[list[str]]
An optional list of column names, if the imputer is only supposed to run on specific columns.
The list of columns from the table used to fit the transformer. If `None`, all columns are used.

Returns
-------
fitted_transformer : TableTransformer
The fitted transformer.
"""
if column_names is None:
column_names = table.schema.get_column_names()
column_names = table.get_column_names()
else:
missing_columns = set(column_names) - set(table.get_column_names())
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))

if self._imp.strategy == "most_frequent":
if isinstance(self._strategy, Imputer.Strategy.Mode):
for name in column_names:
if 1 < len(table.get_column(name).mode()):
raise IndexError(
"There are multiple frequent values in a column given for the Imputer"
)
if len(table.get_column(name).mode()) > 1:
raise IndexError("There are multiple most frequent values in a column given for the Imputer")

indices = [table.schema._get_column_index_by_name(name) for name in column_names]

wrapped_transformer = sk_SimpleImputer()
self._strategy._augment_imputer(wrapped_transformer)
wrapped_transformer.fit(table._data[indices])

result = Imputer(self._strategy)
result._wrapped_transformer = wrapped_transformer
result._column_names = column_names

self._column_names = column_names
indices = [
table.schema._get_column_index_by_name(name) for name in self._column_names
]
self._imp.fit(table._data[indices])
return result

# noinspection PyProtectedMember
def transform(self, table: Table) -> Table:
"""
Impute the missing values on the dataset.
Apply the learned transformation to a table.

Parameters
----------
table : Table
The dataset to be imputed.
The table to which the learned transformation is applied.

Returns
-------
table : Table
The dataset with missing values imputed by the given strategy.
"""
data = table._data.copy()
indices = [
table.schema._get_column_index_by_name(name) for name in self._column_names
]
data[indices] = pd.DataFrame(
self._imp.transform(data[indices]), columns=indices
)
return Table(data, table.schema)
transformed_table : Table
The transformed table.

def fit_transform(
self, table: Table, column_names: Optional[list[str]] = None
) -> Table:
Raises
------
NotFittedError
If the transformer has not been fitted yet.
"""
Fit the imputer on the dataset and impute the missing values.

Parameters
----------
table : Table
The table used to learn the imputation values.
column_names : Optional[list[str]]
An optional list of column names, if the imputer is only supposed to run on specific columns.
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise NotFittedError()

Returns
-------
table : Table
The dataset with missing values imputed by the given strategy.
"""
self.fit(table, column_names)
return self.transform(table)
# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.get_column_names())
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))

data = table._data.copy()
indices = [table.schema._get_column_index_by_name(name) for name in self._column_names]
data[indices] = pd.DataFrame(self._wrapped_transformer.transform(data[indices]), columns=indices)
return Table(data, table.schema)
Loading