Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: specify extras instead of features in to_tabular_dataset #685

Merged
21 changes: 7 additions & 14 deletions docs/tutorials/classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"\n",
"titanic = Table.from_csv_file(\"data/titanic.csv\")\n",
"#For visualisation purposes we only print out the first 15 rows.\n",
"titanic.slice_rows(0,15)"
"titanic.slice_rows(0, 15)"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -77,7 +77,6 @@
"source": [
"from safeds.data.tabular.transformation import OneHotEncoder\n",
"\n",
"old_column_names = train_table.column_names\n",
"encoder = OneHotEncoder().fit(train_table, [\"sex\"])"
],
"metadata": {
Expand All @@ -97,18 +96,14 @@
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"transformed_table = encoder.transform(train_table)\n",
"new_column_names = transformed_table.column_names\n",
"new_columns= set(new_column_names) - set(old_column_names)"
],
"source": "transformed_table = encoder.transform(train_table)",
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Include some columns only as extra columns, which are completely ignored by the model:",
"metadata": {
"collapsed": false
}
Expand All @@ -118,9 +113,9 @@
"execution_count": null,
"outputs": [],
"source": [
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])"
"extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n",
"\n",
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -192,9 +187,7 @@
"encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
"testing_table = encoder.transform(testing_table)\n",
"\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n",
"fitted_model.accuracy(test_tabular_dataset)\n"
],
"metadata": {
Expand Down
11 changes: 4 additions & 7 deletions docs/tutorials/regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
},
{
"cell_type": "markdown",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.\n",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Include the `id` column only as an extra column, which is completely ignored by the model:",
"metadata": {
"collapsed": false
}
Expand All @@ -70,10 +70,9 @@
"execution_count": null,
"outputs": [],
"source": [
"feature_columns = set(train_table.column_names) - set([\"price\", \"id\"])\n",
"extra_names = [\"id\"]\n",
"\n",
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns])\n"
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -147,9 +146,7 @@
}
],
"source": [
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n",
"\n",
"fitted_model.mean_absolute_error(test_tabular_dataset)\n"
],
Expand Down
169 changes: 39 additions & 130 deletions src/safeds/data/labeled/containers/_tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@

from safeds._utils import _structural_hash
from safeds.data.tabular.containers import Column, Table
from safeds.exceptions import (
UnknownColumnNameError,
)

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
Expand All @@ -22,150 +19,67 @@ class TabularDataset:
"""
A tabular dataset maps feature columns to a target column.

Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
If the target column is also an extra column.
ValueError
If no feature columns are specified.
If no feature columns remain.

Examples
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b"], "col2": [1, 2]})
>>> tabular_dataset = table.to_tabular_dataset("col2", ["col1"])
>>> from safeds.data.labeled.containers import TabularDataset
>>> dataset = TabularDataset(
... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]},
... target_name="target",
... extra_names=["id"]
... )
"""

# ------------------------------------------------------------------------------------------------------------------
# Creation
# ------------------------------------------------------------------------------------------------------------------

@staticmethod
def _from_table(
table: Table,
target_name: str,
feature_names: list[str] | None = None,
) -> TabularDataset:
"""
Create a tabular dataset from a table.

Parameters
----------
table:
The table.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Returns
-------
tabular_dataset:
The created tabular dataset.

Raises
------
UnknownColumnNameError
If target_name matches none of the column names.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b", "c", "a"], "col2": [1, 2, 3, 4]})
>>> tabular_dataset = TabularDataset._from_table(table, "col2", ["col1"])
"""
table = table._as_table()
if target_name not in table.column_names:
raise UnknownColumnNameError([target_name])

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = table.column_names
feature_names.remove(target_name)

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")

# Create result
result = object.__new__(TabularDataset)

result._table = table
result._features = table.keep_only_columns(feature_names)
result._target = table.get_column(target_name)

return result

# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(
self,
data: Mapping[str, Sequence[Any]],
data: Table | Mapping[str, Sequence[Any]],
target_name: str,
feature_names: list[str] | None = None,
extra_names: list[str] | None = None,
):
"""
Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> table = TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"])
"""
self._table = Table(data)
# Preprocess inputs
if not isinstance(data, Table):
data = Table(data)
if extra_names is None:
extra_names = []

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = self._table.column_names
if target_name in feature_names:
feature_names.remove(target_name)
# Derive feature names
feature_names = [name for name in data.column_names if name not in {target_name, *extra_names}]

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if target_name in extra_names:
raise ValueError(f"Column '{target_name}' cannot be both target and extra.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")
raise ValueError("At least one feature column must remain.")

self._features: Table = self._table.keep_only_columns(feature_names)
self._target: Column = self._table.get_column(target_name)
# Set attributes
self._table: Table = data
self._features: Table = data.keep_only_columns(feature_names)
self._target: Column = data.get_column(target_name)
self._extras: Table = data.keep_only_columns(extra_names)

def __eq__(self, other: object) -> bool:
"""
Expand Down Expand Up @@ -210,27 +124,22 @@ def __sizeof__(self) -> int:

@property
def features(self) -> Table:
"""
Get the feature columns of the tabular dataset.

Returns
-------
features:
The table containing the feature columns.
"""
"""The feature columns of the tabular dataset."""
return self._features

@property
def target(self) -> Column:
"""The target column of the tabular dataset."""
return self._target

@property
def extras(self) -> Table:
"""
Get the target column of the tabular dataset.
Additional columns of the tabular dataset that are neither features nor target.

Returns
-------
target:
The target column.
These can be used to store additional information about instances, such as IDs.
"""
return self._target
return self._extras

# ------------------------------------------------------------------------------------------------------------------
# Conversion
Expand Down
13 changes: 7 additions & 6 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2412,7 +2412,7 @@ def to_rows(self) -> list[Row]:
for (_, series_row) in self._data.iterrows()
]

def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = None) -> TabularDataset:
def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset:
"""
Return a new `TabularDataset` with columns marked as a target column, feature columns, or extra columns.

Expand All @@ -2422,12 +2422,13 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
----------
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Returns
-------
tabular_dataset:
dataset:
A new tabular dataset with the given target and extra names.

Raises
Expand All @@ -2441,11 +2442,11 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]})
>>> tabular_dataset = table.to_tabular_dataset(target_name="amount_bought", feature_names=["item", "price"])
>>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"])
"""
from safeds.data.labeled.containers import TabularDataset

return TabularDataset._from_table(self, target_name, feature_names)
return TabularDataset(self, target_name, extra_names)

# ------------------------------------------------------------------------------------------------------------------
# IPython integration
Expand Down
Loading