Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: specify extras instead of features in to_tabular_dataset #685

Merged
21 changes: 7 additions & 14 deletions docs/tutorials/classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"\n",
"titanic = Table.from_csv_file(\"data/titanic.csv\")\n",
"#For visualisation purposes we only print out the first 15 rows.\n",
"titanic.slice_rows(0,15)"
"titanic.slice_rows(0, 15)"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -77,7 +77,6 @@
"source": [
"from safeds.data.tabular.transformation import OneHotEncoder\n",
"\n",
"old_column_names = train_table.column_names\n",
"encoder = OneHotEncoder().fit(train_table, [\"sex\"])"
],
"metadata": {
Expand All @@ -97,18 +96,14 @@
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"transformed_table = encoder.transform(train_table)\n",
"new_column_names = transformed_table.column_names\n",
"new_columns= set(new_column_names) - set(old_column_names)"
],
"source": "transformed_table = encoder.transform(train_table)",
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.",
"source": "5. Mark the `survived` `Column` as the target variable to be predicted. Include some columns only as extra columns, which are completely ignored by the model:",
"metadata": {
"collapsed": false
}
Expand All @@ -118,9 +113,9 @@
"execution_count": null,
"outputs": [],
"source": [
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])"
"extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n",
"\n",
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -192,9 +187,7 @@
"encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
"testing_table = encoder.transform(testing_table)\n",
"\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", feature_names=[\n",
" *new_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n",
"fitted_model.accuracy(test_tabular_dataset)\n"
],
"metadata": {
Expand Down
11 changes: 4 additions & 7 deletions docs/tutorials/regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
},
{
"cell_type": "markdown",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Use the new names of the fitted `Column`s as features, which will be used to make predictions based on the target variable.\n",
"source": "3. Mark the `price` `Column` as the target variable to be predicted. Include the `id` column only as an extra column, which is completely ignored by the model:",
"metadata": {
"collapsed": false
}
Expand All @@ -70,10 +70,9 @@
"execution_count": null,
"outputs": [],
"source": [
"feature_columns = set(train_table.column_names) - set([\"price\", \"id\"])\n",
"extra_names = [\"id\"]\n",
"\n",
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns])\n"
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -147,9 +146,7 @@
}
],
"source": [
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", feature_names=[\n",
" *feature_columns\n",
"])\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n",
"\n",
"fitted_model.mean_absolute_error(test_tabular_dataset)\n"
],
Expand Down
169 changes: 39 additions & 130 deletions src/safeds/data/labeled/containers/_tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@

from safeds._utils import _structural_hash
from safeds.data.tabular.containers import Column, Table
from safeds.exceptions import (
UnknownColumnNameError,
)

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
Expand All @@ -22,150 +19,67 @@ class TabularDataset:
"""
A tabular dataset maps feature columns to a target column.

Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
If the target column is also an extra column.
ValueError
If no feature columns are specified.
If no feature columns remain.

Examples
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b"], "col2": [1, 2]})
>>> tabular_dataset = table.to_tabular_dataset("col2", ["col1"])
>>> from safeds.data.labeled.containers import TabularDataset
>>> dataset = TabularDataset(
... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]},
... target_name="target",
... extra_names=["id"]
... )
"""

# ------------------------------------------------------------------------------------------------------------------
# Creation
# ------------------------------------------------------------------------------------------------------------------

@staticmethod
def _from_table(
table: Table,
target_name: str,
feature_names: list[str] | None = None,
) -> TabularDataset:
"""
Create a tabular dataset from a table.

Parameters
----------
table:
The table.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Returns
-------
tabular_dataset:
The created tabular dataset.

Raises
------
UnknownColumnNameError
If target_name matches none of the column names.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"col1": ["a", "b", "c", "a"], "col2": [1, 2, 3, 4]})
>>> tabular_dataset = TabularDataset._from_table(table, "col2", ["col1"])
"""
table = table._as_table()
if target_name not in table.column_names:
raise UnknownColumnNameError([target_name])

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = table.column_names
feature_names.remove(target_name)

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")

# Create result
result = object.__new__(TabularDataset)

result._table = table
result._features = table.keep_only_columns(feature_names)
result._target = table.get_column(target_name)

return result

# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(
self,
data: Mapping[str, Sequence[Any]],
data: Table | Mapping[str, Sequence[Any]],
target_name: str,
feature_names: list[str] | None = None,
extra_names: list[str] | None = None,
):
"""
Create a tabular dataset from a mapping of column names to their values.

Parameters
----------
data:
The data.
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.

Raises
------
ColumnLengthMismatchError
If columns have different lengths.
ValueError
If the target column is also a feature column.
ValueError
If no feature columns are specified.

Examples
--------
>>> from safeds.data.labeled.containers import TabularDataset
>>> table = TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", ["a"])
"""
self._table = Table(data)
# Preprocess inputs
if not isinstance(data, Table):
data = Table(data)
if extra_names is None:
extra_names = []

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = self._table.column_names
if target_name in feature_names:
feature_names.remove(target_name)
# Derive feature names
feature_names = [name for name in data.column_names if name not in {target_name, *extra_names}]

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if target_name in extra_names:
raise ValueError(f"Column '{target_name}' cannot be both target and extra.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")
raise ValueError("At least one feature column must remain.")

self._features: Table = self._table.keep_only_columns(feature_names)
self._target: Column = self._table.get_column(target_name)
# Set attributes
self._table: Table = data
self._features: Table = data.keep_only_columns(feature_names)
self._target: Column = data.get_column(target_name)
self._extras: Table = data.keep_only_columns(extra_names)

def __eq__(self, other: object) -> bool:
"""
Expand Down Expand Up @@ -210,27 +124,22 @@ def __sizeof__(self) -> int:

@property
def features(self) -> Table:
"""
Get the feature columns of the tabular dataset.

Returns
-------
features:
The table containing the feature columns.
"""
"""The feature columns of the tabular dataset."""
return self._features

@property
def target(self) -> Column:
"""The target column of the tabular dataset."""
return self._target

@property
def extras(self) -> Table:
"""
Get the target column of the tabular dataset.
Additional columns of the tabular dataset that are neither features nor target.

Returns
-------
target:
The target column.
These can be used to store additional information about instances, such as IDs.
"""
return self._target
return self._extras

# ------------------------------------------------------------------------------------------------------------------
# Conversion
Expand Down
13 changes: 7 additions & 6 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2412,7 +2412,7 @@ def to_rows(self) -> list[Row]:
for (_, series_row) in self._data.iterrows()
]

def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None = None) -> TabularDataset:
def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset:
"""
Return a new `TabularDataset` with columns marked as a target column, feature columns, or extra columns.

Expand All @@ -2422,12 +2422,13 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
----------
target_name:
Name of the target column.
feature_names:
Names of the feature columns. If None, all columns except the target column are used.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.

Returns
-------
tabular_dataset:
dataset:
A new tabular dataset with the given target and extra names.

Raises
Expand All @@ -2441,11 +2442,11 @@ def to_tabular_dataset(self, target_name: str, feature_names: list[str] | None =
--------
>>> from safeds.data.tabular.containers import Table
>>> table = Table({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]})
>>> tabular_dataset = table.to_tabular_dataset(target_name="amount_bought", feature_names=["item", "price"])
>>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"])
"""
from safeds.data.labeled.containers import TabularDataset

return TabularDataset._from_table(self, target_name, feature_names)
return TabularDataset(self, target_name, extra_names)

# ------------------------------------------------------------------------------------------------------------------
# IPython integration
Expand Down
Loading