feat: hyperparameter optimization for classical models (#843)
Closes #264 

Adjusted the classical ML models to accept Choice parameters for their hyperparameters.

New features for classifiers and regressors:
- combined the Linear, Lasso, Ridge, and ElasticNet regressors into a single ElasticNetRegressor
- changed property methods and parameter types to accept Choice values
- added fit_by_exhaustive_search(), which fits a model with all combinations of the given Choices (see the usage sketch after this list)
- added errors for using the wrong fit method (calling fit with Choice parameters, or fit_by_exhaustive_search without them)
- added the enums ClassifierMetric and RegressorMetric, which are passed to fit_by_exhaustive_search to select the optimization metric
- added cross-validation in fit_by_exhaustive_search
- added multiprocessing in fit_by_exhaustive_search
- added tests for all methods and classes

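In practice, the new workflow looks roughly like the sketch below. It is a minimal, hypothetical example pieced together from the feature list above: the import path for ClassifierMetric, the Choice constructor call, the metric member name, and the exact fit_by_exhaustive_search signature are assumptions, not confirmed API.

```python
# Hypothetical usage sketch; import paths, the Choice constructor, and the
# fit_by_exhaustive_search signature are assumed from the description above.
from safeds.data.tabular.containers import Table
from safeds.ml.classical.classification import RandomForestClassifier
from safeds.ml.hyperparameters import Choice
from safeds.ml.metrics import ClassifierMetric  # assumed import path

training_set = Table.from_csv_file("titanic.csv").to_tabular_dataset("survived")

# Hyperparameters may now be fixed values or Choices of candidate values.
model = RandomForestClassifier(
    tree_count=Choice(10, 50, 100),
    max_depth=Choice(None, 5, 10),
)

# Calling model.fit(training_set) here would raise FittingWithChoiceError,
# because the model still contains unresolved Choices. Instead, evaluate every
# combination with cross-validation and keep the best model:
fitted = model.fit_by_exhaustive_search(training_set, optimization_metric=ClassifierMetric.ACCURACY)
```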
---------

Co-authored-by: megalinter-bot <[email protected]>
Co-authored-by: Lars Reimann <[email protected]>
3 people authored Aug 31, 2024
1 parent b1e8933 commit d8f7491
Showing 51 changed files with 1,530 additions and 629 deletions.
28 changes: 19 additions & 9 deletions src/safeds/ml/classical/_bases/_ada_boost_base.py
@@ -5,6 +5,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 if TYPE_CHECKING:
     from safeds.ml.classical import SupervisedModel
@@ -18,16 +19,25 @@ class _AdaBoostBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        max_learner_count: int,
-        learning_rate: float,
+        max_learner_count: int | Choice[int],
+        learning_rate: float | Choice[float],
     ) -> None:
         # Validation
-        _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1))
-        _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
+        if isinstance(max_learner_count, Choice):
+            for mlc in max_learner_count:
+                _check_bounds("max_learner_count", mlc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(learning_rate, Choice):
+            for lr in learning_rate:
+                _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
 
         # Hyperparameters
-        self._max_learner_count: int = max_learner_count
-        self._learning_rate: float = learning_rate
+        self._max_learner_count: int | Choice[int] = max_learner_count
+        self._learning_rate: float | Choice[float] = learning_rate
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -40,16 +50,16 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def max_learner_count(self) -> int:
+    def max_learner_count(self) -> int | Choice[int]:
         """The maximum number of learners in the ensemble."""
         return self._max_learner_count
 
     @property
-    def learning_rate(self) -> float:
+    def learning_rate(self) -> float | Choice[float]:
         """The learning rate."""
         return self._learning_rate
 
     @property
     @abstractmethod
-    def learner(self) -> SupervisedModel | None:
+    def learner(self) -> SupervisedModel | None | Choice[SupervisedModel | None]:
         """The base learner used for training the ensemble."""
29 changes: 17 additions & 12 deletions src/safeds/ml/classical/_bases/_decision_tree_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _DecisionTreeBase(ABC):
@@ -14,20 +15,24 @@ class _DecisionTreeBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        max_depth: int | None,
-        min_sample_count_in_leaves: int,
+        max_depth: int | None | Choice[int | None],
+        min_sample_count_in_leaves: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
-        _check_bounds(
-            "min_sample_count_in_leaves",
-            min_sample_count_in_leaves,
-            lower_bound=_ClosedBound(1),
-        )
+        if isinstance(max_depth, Choice):
+            for md in max_depth:
+                _check_bounds("max_depth", md, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
+        if isinstance(min_sample_count_in_leaves, Choice):
+            for msc in min_sample_count_in_leaves:
+                _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("min_sample_count_in_leaves", min_sample_count_in_leaves, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
-        self._max_depth: int | None = max_depth
-        self._min_sample_count_in_leaves: int = min_sample_count_in_leaves
+        self._max_depth: int | None | Choice[int | None] = max_depth
+        self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -40,11 +45,11 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def max_depth(self) -> int | None:
+    def max_depth(self) -> int | None | Choice[int | None]:
         """The maximum depth of the tree."""
         return self._max_depth
 
     @property
-    def min_sample_count_in_leaves(self) -> int:
+    def min_sample_count_in_leaves(self) -> int | Choice[int]:
         """The minimum number of samples that must remain in the leaves of the tree."""
         return self._min_sample_count_in_leaves
22 changes: 16 additions & 6 deletions src/safeds/ml/classical/_bases/_gradient_boosting_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _GradientBoostingBase(ABC):
@@ -14,12 +15,21 @@ class _GradientBoostingBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        tree_count: int,
-        learning_rate: float,
+        tree_count: int | Choice[int],
+        learning_rate: float | Choice[float],
     ) -> None:
         # Validation
-        _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
-        _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
+        if isinstance(tree_count, Choice):
+            for tc in tree_count:
+                _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(learning_rate, Choice):
+            for lr in learning_rate:
+                _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
 
         # Hyperparameters
         self._tree_count = tree_count
@@ -36,11 +46,11 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def tree_count(self) -> int:
+    def tree_count(self) -> int | Choice[int]:
         """The number of trees (estimators) in the ensemble."""
         return self._tree_count
 
     @property
-    def learning_rate(self) -> float:
+    def learning_rate(self) -> float | Choice[float]:
         """The learning rate."""
         return self._learning_rate
11 changes: 8 additions & 3 deletions src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _KNearestNeighborsBase(ABC):
@@ -14,10 +15,14 @@ class _KNearestNeighborsBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        neighbor_count: int,
+        neighbor_count: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1))
+        if isinstance(neighbor_count, Choice):
+            for nc in neighbor_count:
+                _check_bounds("neighbor_count", nc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
         self._neighbor_count = neighbor_count
@@ -32,6 +37,6 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def neighbor_count(self) -> int:
+    def neighbor_count(self) -> int | Choice[int]:
         """The number of neighbors used for interpolation."""
         return self._neighbor_count
43 changes: 27 additions & 16 deletions src/safeds/ml/classical/_bases/_random_forest_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _RandomForestBase(ABC):
@@ -14,23 +15,33 @@ class _RandomForestBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        tree_count: int,
-        max_depth: int | None,
-        min_sample_count_in_leaves: int,
+        tree_count: int | Choice[int],
+        max_depth: int | None | Choice[int | None],
+        min_sample_count_in_leaves: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
-        _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
-        _check_bounds(
-            "min_sample_count_in_leaves",
-            min_sample_count_in_leaves,
-            lower_bound=_ClosedBound(1),
-        )
+        if isinstance(tree_count, Choice):
+            for tc in tree_count:
+                _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(max_depth, Choice):
+            for md in max_depth:
+                _check_bounds("max_depth", md, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
+
+        if isinstance(min_sample_count_in_leaves, Choice):
+            for msc in min_sample_count_in_leaves:
+                _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("min_sample_count_in_leaves", min_sample_count_in_leaves, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
-        self._tree_count: int = tree_count
-        self._max_depth: int | None = max_depth
-        self._min_sample_count_in_leaves: int = min_sample_count_in_leaves
+        self._tree_count: int | Choice[int] = tree_count
+        self._max_depth: int | None | Choice[int | None] = max_depth
+        self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -44,16 +55,16 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def tree_count(self) -> int:
+    def tree_count(self) -> int | Choice[int]:
         """The number of trees used in the random forest."""
         return self._tree_count
 
     @property
-    def max_depth(self) -> int | None:
+    def max_depth(self) -> int | None | Choice[int | None]:
         """The maximum depth of each tree."""
         return self._max_depth
 
     @property
-    def min_sample_count_in_leaves(self) -> int:
+    def min_sample_count_in_leaves(self) -> int | Choice[int]:
         """The minimum number of samples that must remain in the leaves of each tree."""
         return self._min_sample_count_in_leaves
19 changes: 12 additions & 7 deletions src/safeds/ml/classical/_bases/_support_vector_machine_base.py
@@ -6,6 +6,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 if TYPE_CHECKING:
     from sklearn.svm import SVC as SklearnSVC  # noqa: N811
@@ -76,18 +77,22 @@ def sigmoid() -> _SupportVectorMachineBase.Kernel:
     @abstractmethod
     def __init__(
         self,
-        c: float,
-        kernel: _SupportVectorMachineBase.Kernel | None,
+        c: float | Choice[float],
+        kernel: _SupportVectorMachineBase.Kernel | None | Choice[_SupportVectorMachineBase.Kernel | None],
     ) -> None:
         if kernel is None:
             kernel = _SupportVectorMachineBase.Kernel.radial_basis_function()
 
         # Validation
-        _check_bounds("c", c, lower_bound=_OpenBound(0))
+        if isinstance(c, Choice):
+            for value in c:
+                _check_bounds("c", value, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("c", c, lower_bound=_OpenBound(0))
 
         # Hyperparameters
-        self._c: float = c
-        self._kernel: _SupportVectorMachineBase.Kernel = kernel
+        self._c: float | Choice[float] = c
+        self._kernel: _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None] = kernel
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -100,14 +105,14 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def c(self) -> float:
+    def c(self) -> float | Choice[float]:
         """The regularization strength."""
         return self._c
 
     # This property is abstract, so subclasses must declare a public return type.
     @property
     @abstractmethod
-    def kernel(self) -> _SupportVectorMachineBase.Kernel:
+    def kernel(self) -> _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None]:
         """The type of kernel used."""
 
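One subtlety in this file: the None-to-default replacement for the kernel runs before the validation, so it only applies to a plain None argument; a Choice that contains None is stored unchanged, presumably to be resolved per candidate during the exhaustive search. Below is a sketch of the resulting behavior; the public class name and the Kernel access path are assumptions.

```python
from safeds.ml.classical.classification import SupportVectorClassifier  # class name assumed
from safeds.ml.hyperparameters import Choice

# A plain None kernel is replaced by the radial basis function default on construction.
plain = SupportVectorClassifier(c=1.0, kernel=None)

# A Choice containing None is kept as-is; each candidate (including None) is only
# materialized into a concrete model during fit_by_exhaustive_search().
tuned = SupportVectorClassifier(
    c=Choice(0.5, 1.0),
    kernel=Choice(None, SupportVectorClassifier.Kernel.sigmoid()),  # sigmoid() appears in the diff above
)
```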
28 changes: 19 additions & 9 deletions src/safeds/ml/classical/_supervised_model.py
@@ -80,6 +80,12 @@ def fit(self, training_set: TabularDataset) -> Self:
         Raises
         ------
         PlainTableError
+            If a table is passed instead of a TabularDataset.
+        DatasetMissesDataError
+            If the given training set contains no data.
+        FittingWithChoiceError
+            When trying to call this method on a model with hyperparameter choices.
         LearningError
             If the training data contains invalid values or if the training failed.
         """
@@ -88,7 +94,8 @@ def fit(self, training_set: TabularDataset) -> Self:
         if training_set.to_table().row_count == 0:
             raise DatasetMissesDataError
 
-        self._check_additional_fit_preconditions(training_set)
+        self._check_additional_fit_preconditions()
+        self._check_more_additional_fit_preconditions(training_set)
 
         wrapped_model = self._get_sklearn_model()
         _fit_sklearn_model_in_place(wrapped_model, training_set)
@@ -234,15 +241,14 @@ def get_target_type(self) -> DataType:
     # Template methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def _check_additional_fit_preconditions(self, training_set: TabularDataset) -> None:  # noqa: B027
-        """
-        Check additional preconditions for fitting the model and raise an error if any are violated.
-
-        Parameters
-        ----------
-        training_set:
-            The training data containing the features and target.
-        """
+    def _check_additional_fit_preconditions(self) -> None:  # noqa: B027
+        """Check additional preconditions for fitting the model and raise an error if any are violated."""
+
+    def _check_more_additional_fit_preconditions(self, training_set: TabularDataset) -> None:  # noqa: B027
+        """Check additional preconditions for fitting the model and raise an error if any are violated."""
+
+    def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None:  # noqa: B027
+        """Check additional preconditions for fitting by exhaustive search and raise an error if any are violated."""
 
     def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset) -> None:  # noqa: B027
         """
@@ -254,6 +260,10 @@ def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset) -> None:  # noqa: B027
             The dataset containing at least the features.
         """
 
+    def _get_models_for_all_choices(self) -> list[Self]:
+        """Get a list of all possible models, given the Parameter Choices."""
+        raise NotImplementedError  # pragma: no cover
+
     @abstractmethod
     def _clone(self) -> Self:
         """
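The new _get_models_for_all_choices template method is presumably what fit_by_exhaustive_search uses to expand a model with Choices into concrete candidates before cross-validating each one. Below is a minimal sketch of how a subclass might implement it, using a toy stand-in class rather than a real safeds model; only the iterability of Choice is assumed.

```python
import itertools
from typing import Any

from safeds.ml.hyperparameters import Choice


def _candidates(value: Any) -> list[Any]:
    """A plain value is a single candidate; a Choice (iterable) yields several."""
    return list(value) if isinstance(value, Choice) else [value]


class _ToyModel:
    """Illustrative stand-in, not part of safeds."""

    def __init__(self, tree_count: Any, learning_rate: Any) -> None:
        self._tree_count = tree_count
        self._learning_rate = learning_rate

    def _get_models_for_all_choices(self) -> list["_ToyModel"]:
        # One concrete model per combination of candidate hyperparameter values.
        return [
            _ToyModel(tree_count=tc, learning_rate=lr)
            for tc, lr in itertools.product(
                _candidates(self._tree_count),
                _candidates(self._learning_rate),
            )
        ]
```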