Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: consistent selector parameters #983

Merged
merged 5 commits into from
Jan 14, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: consistently name selector parameters in TableTransformers
lars-reimann committed Jan 14, 2025
commit 95e5459857aa5471ec0ab809f89b152fe8be6305
4 changes: 2 additions & 2 deletions docs/tutorials/classification.ipynb
Original file line number Diff line number Diff line change
@@ -208,7 +208,7 @@
"source": [
"from safeds.data.tabular.transformation import SimpleImputer\n",
"\n",
"simple_imputer = SimpleImputer(column_names=[\"age\", \"fare\"], strategy=SimpleImputer.Strategy.mean())\n",
"simple_imputer = SimpleImputer(selector=[\"age\", \"fare\"], strategy=SimpleImputer.Strategy.mean())\n",
"fitted_simple_imputer_train, transformed_train_data = simple_imputer.fit_and_transform(train_table)\n",
"transformed_test_data = fitted_simple_imputer_train.transform(test_table)"
]
@@ -241,7 +241,7 @@
"from safeds.data.tabular.transformation import OneHotEncoder\n",
"\n",
"fitted_one_hot_encoder_train, transformed_train_data = OneHotEncoder(\n",
" column_names=[\"sex\", \"port_embarked\"],\n",
" selector=[\"sex\", \"port_embarked\"],\n",
").fit_and_transform(transformed_train_data)\n",
"transformed_test_data = fitted_one_hot_encoder_train.transform(transformed_test_data)"
]
10 changes: 5 additions & 5 deletions docs/tutorials/data_processing.ipynb
Original file line number Diff line number Diff line change
@@ -510,7 +510,7 @@
"source": [
"from safeds.data.tabular.transformation import SimpleImputer\n",
"\n",
"imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), column_names=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(\n",
"imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), selector=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(\n",
" titanic,\n",
")\n",
"imputer.transform(titanic_slice)"
@@ -583,7 +583,7 @@
"source": [
"from safeds.data.tabular.transformation import LabelEncoder\n",
"\n",
"encoder = LabelEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
"encoder = LabelEncoder(selector=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
"encoder.transform(titanic_slice)"
]
},
@@ -674,7 +674,7 @@
"source": [
"from safeds.data.tabular.transformation import OneHotEncoder\n",
"\n",
"encoder = OneHotEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
"encoder = OneHotEncoder(selector=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
"encoder.transform(titanic_slice)"
]
},
@@ -745,7 +745,7 @@
"source": [
"from safeds.data.tabular.transformation import RangeScaler\n",
"\n",
"scaler = RangeScaler(column_names=\"age\", min_=0.0, max_=1.0).fit(titanic)\n",
"scaler = RangeScaler(selector=\"age\", min_=0.0, max_=1.0).fit(titanic)\n",
"scaler.transform(titanic_slice)"
]
},
@@ -816,7 +816,7 @@
"source": [
"from safeds.data.tabular.transformation import StandardScaler\n",
"\n",
"scaler = StandardScaler(column_names=[\"age\", \"travel_class\"]).fit(titanic)\n",
"scaler = StandardScaler(selector=[\"age\", \"travel_class\"]).fit(titanic)\n",
"scaler.transform(titanic_slice)"
]
},
2 changes: 1 addition & 1 deletion src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
@@ -448,7 +448,7 @@ def __init__(self, column: Column) -> None:
)
# TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not
# be done automatically?
self._one_hot_encoder = OneHotEncoder(column_names=self._column_name).fit(column_as_table)
self._one_hot_encoder = OneHotEncoder(selector=self._column_name).fit(column_as_table)
self._tensor = torch.Tensor(
self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch(dtype=pl.Float32),
).to(_get_device())
20 changes: 10 additions & 10 deletions src/safeds/data/tabular/transformation/_discretizer.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ class Discretizer(TableTransformer):
----------
bin_count:
The number of bins to be created.
column_names:
selector:
The list of columns used to fit the transformer. If `None`, all numeric columns are used.

Raises
@@ -41,9 +41,9 @@ def __init__(
self,
bin_count: int = 5,
*,
column_names: str | list[str] | None = None,
selector: str | list[str] | None = None,
) -> None:
TableTransformer.__init__(self, column_names)
TableTransformer.__init__(self, selector)

_check_bounds("bin_count", bin_count, lower_bound=_ClosedBound(2))

@@ -104,10 +104,10 @@ def fit(self, table: Table) -> Discretizer:
if table.row_count == 0:
raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows")

if self._column_names is None:
if self._selector is None:
column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric]
else:
column_names = self._column_names
column_names = self._selector
_check_columns_exist(table, column_names)
_check_columns_are_numeric(table, column_names, operation="fit a Discretizer")

@@ -117,7 +117,7 @@ def fit(self, table: Table) -> Discretizer:
table.select_columns(column_names)._data_frame,
)

result = Discretizer(self._bin_count, column_names=column_names)
result = Discretizer(self._bin_count, selector=column_names)
result._wrapped_transformer = wrapped_transformer

return result
@@ -150,21 +150,21 @@ def transform(self, table: Table) -> Table:
If one of the columns that should be transformed is non-numeric.
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
if self._wrapped_transformer is None or self._selector is None:
raise NotFittedError(kind="transformer")

if table.row_count == 0:
raise ValueError("The table cannot be transformed because it contains 0 rows")

# Input table does not contain all columns used to fit the transformer
_check_columns_exist(table, self._column_names)
_check_columns_exist(table, self._selector)

for column in self._column_names:
for column in self._selector:
if not table.get_column(column).type.is_numeric:
raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.")

new_data = self._wrapped_transformer.transform(
table.select_columns(self._column_names)._data_frame,
table.select_columns(self._selector)._data_frame,
)
return Table._from_polars_lazy_frame(
table._lazy_frame.update(new_data.lazy()),
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@ class KNearestNeighborsImputer(TableTransformer):
----------
neighbor_count:
The number of neighbors to consider when imputing missing values.
column_names:
selector:
The list of columns used to impute missing values. If `None`, all columns are used.
value_to_replace:
The placeholder for the missing values. All occurrences of `value_to_replace` will be imputed.
@@ -35,10 +35,10 @@ def __init__(
self,
neighbor_count: int,
*,
column_names: str | list[str] | None = None,
selector: str | list[str] | None = None,
value_to_replace: float | str | None = None,
) -> None:
super().__init__(column_names)
super().__init__(selector)

_check_bounds(name="neighbor_count", actual=neighbor_count, lower_bound=_ClosedBound(1))

@@ -106,10 +106,10 @@ def fit(self, table: Table) -> KNearestNeighborsImputer:
if table.row_count == 0:
raise ValueError("The KNearestNeighborsImputer cannot be fitted because the table contains 0 rows.")

if self._column_names is None:
if self._selector is None:
column_names = table.column_names
else:
column_names = self._column_names
column_names = self._selector
_check_columns_exist(table, column_names)

value_to_replace = self._value_to_replace
@@ -125,7 +125,7 @@ def fit(self, table: Table) -> KNearestNeighborsImputer:
table.select_columns(column_names)._data_frame,
)

result = KNearestNeighborsImputer(self._neighbor_count, column_names=column_names)
result = KNearestNeighborsImputer(self._neighbor_count, selector=column_names)
result._wrapped_transformer = wrapped_transformer

return result
@@ -153,13 +153,13 @@ def transform(self, table: Table) -> Table:
ColumnNotFoundError
If one of the columns that should be transformed is not in the table.
"""
if self._column_names is None or self._wrapped_transformer is None:
if self._selector is None or self._wrapped_transformer is None:
raise NotFittedError(kind="transformer")

_check_columns_exist(table, self._column_names)
_check_columns_exist(table, self._selector)

new_data = self._wrapped_transformer.transform(
table.select_columns(self._column_names)._data_frame,
table.select_columns(self._selector)._data_frame,
)

return Table._from_polars_lazy_frame(
28 changes: 13 additions & 15 deletions src/safeds/data/tabular/transformation/_label_encoder.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@ class LabelEncoder(InvertibleTableTransformer):

Parameters
----------
column_names:
selector:
The list of columns used to fit the transformer. If `None`, all non-numeric columns are used.
partial_order:
The partial order of the labels. The labels are encoded in the order of the given list. Additional values are
@@ -31,10 +31,10 @@ class LabelEncoder(InvertibleTableTransformer):
def __init__(
self,
*,
column_names: str | list[str] | None = None,
selector: str | list[str] | None = None,
partial_order: list[Any] | None = None,
) -> None:
super().__init__(column_names)
super().__init__(selector)

if partial_order is None:
partial_order = []
@@ -94,10 +94,10 @@ def fit(self, table: Table) -> LabelEncoder:
ValueError
If the table contains 0 rows.
"""
if self._column_names is None:
if self._selector is None:
column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric]
else:
column_names = self._column_names
column_names = self._selector
_check_columns_exist(table, column_names)
_warn_if_columns_are_numeric(table, column_names)

@@ -121,7 +121,7 @@ def fit(self, table: Table) -> LabelEncoder:
reverse_mapping[name][label] = value

# Create a copy with the learned transformation
result = LabelEncoder(column_names=column_names, partial_order=self._partial_order)
result = LabelEncoder(selector=column_names, partial_order=self._partial_order)
result._mapping = mapping
result._inverse_mapping = reverse_mapping

@@ -155,14 +155,14 @@ def transform(self, table: Table) -> Table:
import polars as pl

# Used in favor of is_fitted, so the type checker is happy
if self._column_names is None or self._mapping is None:
if self._selector is None or self._mapping is None:
raise NotFittedError(kind="transformer")

_check_columns_exist(table, self._column_names)
_check_columns_exist(table, self._selector)

columns = [
pl.col(name).replace_strict(self._mapping[name], default=None, return_dtype=pl.UInt32)
for name in self._column_names
for name in self._selector
]

return Table._from_polars_lazy_frame(
@@ -197,19 +197,17 @@ def inverse_transform(self, transformed_table: Table) -> Table:
import polars as pl

# Used in favor of is_fitted, so the type checker is happy
if self._column_names is None or self._inverse_mapping is None:
if self._selector is None or self._inverse_mapping is None:
raise NotFittedError(kind="transformer")

_check_columns_exist(transformed_table, self._column_names)
_check_columns_exist(transformed_table, self._selector)
_check_columns_are_numeric(
transformed_table,
self._column_names,
self._selector,
operation="inverse-transform with a LabelEncoder",
)

columns = [
pl.col(name).replace_strict(self._inverse_mapping[name], default=None) for name in self._column_names
]
columns = [pl.col(name).replace_strict(self._inverse_mapping[name], default=None) for name in self._selector]

return Table._from_polars_lazy_frame(
transformed_table._lazy_frame.with_columns(columns),
22 changes: 11 additions & 11 deletions src/safeds/data/tabular/transformation/_one_hot_encoder.py
Original file line number Diff line number Diff line change
@@ -42,7 +42,7 @@ class OneHotEncoder(InvertibleTableTransformer):

Parameters
----------
column_names:
selector:
The list of columns used to fit the transformer. If `None`, all non-numeric columns are used.
separator:
The separator used to separate the original column name from the value in the new column names.
@@ -73,10 +73,10 @@ class OneHotEncoder(InvertibleTableTransformer):
def __init__(
self,
*,
column_names: str | list[str] | None = None,
selector: str | list[str] | None = None,
separator: str = "__",
) -> None:
super().__init__(column_names)
super().__init__(selector)

# Parameters
self._separator = separator
@@ -142,10 +142,10 @@ def fit(self, table: Table) -> OneHotEncoder:
ValueError
If the table contains 0 rows.
"""
if self._column_names is None:
if self._selector is None:
column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric]
else:
column_names = self._column_names
column_names = self._selector
_check_columns_exist(table, column_names)
_warn_if_columns_are_numeric(table, column_names)

@@ -175,7 +175,7 @@ def fit(self, table: Table) -> OneHotEncoder:
mapping[name].append((new_name, value))

# Create a copy with the learned transformation
result = OneHotEncoder(column_names=column_names, separator=self._separator)
result = OneHotEncoder(selector=column_names, separator=self._separator)
result._new_column_names = new_column_names
result._mapping = mapping

@@ -207,21 +207,21 @@ def transform(self, table: Table) -> Table:
import polars as pl

# Used in favor of is_fitted, so the type checker is happy
if self._column_names is None or self._mapping is None:
if self._selector is None or self._mapping is None:
raise NotFittedError(kind="transformer")

# TODO: raise schema error instead
_check_columns_exist(table, self._column_names)
_check_columns_exist(table, self._selector)

expressions = [
# UInt8 can be used without conversion in scikit-learn
pl.col(column_name).eq_missing(value).alias(new_name).cast(pl.UInt8)
for column_name in self._column_names
for column_name in self._selector
for new_name, value in self._mapping[column_name]
]

return Table._from_polars_lazy_frame(
table._lazy_frame.with_columns(expressions).drop(self._column_names),
table._lazy_frame.with_columns(expressions).drop(self._selector),
)

def inverse_transform(self, transformed_table: Table) -> Table:
@@ -252,7 +252,7 @@ def inverse_transform(self, transformed_table: Table) -> Table:
import polars as pl

# Used in favor of is_fitted, so the type checker is happy
if self._column_names is None or self._new_column_names is None or self._mapping is None:
if self._selector is None or self._new_column_names is None or self._mapping is None:
raise NotFittedError(kind="transformer")

_check_columns_exist(transformed_table, self._new_column_names)
Loading