From d8f7491fa556def596ce7b006053a4c03e6b9e3d Mon Sep 17 00:00:00 2001 From: Simon Breuer <86068340+sibre28@users.noreply.github.com> Date: Sat, 31 Aug 2024 15:19:51 +0200 Subject: [PATCH] feat: hyperparameter optimization for classical models (#843) Closes #264 Adjusted classical ML-Models to support taking a Choice Parameter New Features for Classifiers and Regressors: - combined Linear, Lasso, Ridge and ElasticNetRegressor into LinearRegressor - changed property methods and parameter types - added fit_by_exhaustive_search(), to fit a model with all combinations of given Choices - added Errors for using the wrong fit method (fit with or fit_by_exhaustive_search without Choice Parameter) - added Enums ClassifierMetric and RegressorMetric, which are passed to fit_by_exhaustive_search to determine the optimization metric - added cross validation in fit_by_exhaustive_search - added multiprocessing in fit_by_exhaustive_search - added tests for all methods and classes --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Co-authored-by: Lars Reimann --- .../ml/classical/_bases/_ada_boost_base.py | 28 +- .../classical/_bases/_decision_tree_base.py | 29 +- .../_bases/_gradient_boosting_base.py | 22 +- .../_bases/_k_nearest_neighbors_base.py | 11 +- .../classical/_bases/_random_forest_base.py | 43 ++- .../_bases/_support_vector_machine_base.py | 19 +- src/safeds/ml/classical/_supervised_model.py | 28 +- .../classification/_ada_boost_classifier.py | 46 ++- .../classical/classification/_classifier.py | 115 +++++- .../_decision_tree_classifier.py | 28 +- .../_gradient_boosting_classifier.py | 26 +- .../_k_nearest_neighbors_classifier.py | 38 +- .../classification/_logistic_classifier.py | 4 + .../_random_forest_classifier.py | 40 +- .../_support_vector_classifier.py | 28 +- .../ml/classical/regression/__init__.py | 9 - .../regression/_ada_boost_regressor.py | 45 ++- .../regression/_baseline_regressor.py | 11 +- 
.../regression/_decision_tree_regressor.py | 28 +- .../regression/_elastic_net_regressor.py | 111 ------ .../_gradient_boosting_regressor.py | 26 +- .../_k_nearest_neighbors_regressor.py | 35 +- .../classical/regression/_lasso_regressor.py | 85 ----- .../classical/regression/_linear_regressor.py | 349 +++++++++++++++++- .../regression/_random_forest_regressor.py | 40 +- .../ml/classical/regression/_regressor.py | 108 +++++- .../classical/regression/_ridge_regressor.py | 84 ----- .../regression/_support_vector_regressor.py | 28 +- src/safeds/ml/hyperparameters/_choice.py | 3 +- src/safeds/ml/metrics/__init__.py | 6 + src/safeds/ml/metrics/_classifier_metric.py | 10 + src/safeds/ml/metrics/_regressor_metric.py | 10 + .../classification/test_ada_boost.py | 9 +- .../classification/test_classifier.py | 127 ++++++- .../classification/test_decision_tree.py | 13 +- .../classification/test_gradient_boosting.py | 9 +- .../test_k_nearest_neighbors.py | 5 +- .../classification/test_random_forest.py | 17 +- .../test_support_vector_machine.py | 5 +- .../ml/classical/regression/test_ada_boost.py | 9 +- .../classical/regression/test_arima_model.py | 4 +- .../regression/test_decision_tree.py | 13 +- .../regression/test_elastic_net_regression.py | 73 ---- .../regression/test_gradient_boosting.py | 9 +- .../regression/test_k_nearest_neighbors.py | 5 +- .../regression/test_lasso_regression.py | 37 -- .../regression/test_linear_regressor.py | 160 ++++++++ .../regression/test_random_forest.py | 17 +- .../ml/classical/regression/test_regressor.py | 112 +++++- .../regression/test_ridge_regression.py | 37 -- .../regression/test_support_vector_machine.py | 5 +- 51 files changed, 1530 insertions(+), 629 deletions(-) delete mode 100644 src/safeds/ml/classical/regression/_elastic_net_regressor.py delete mode 100644 src/safeds/ml/classical/regression/_lasso_regressor.py delete mode 100644 src/safeds/ml/classical/regression/_ridge_regressor.py create mode 100644 
src/safeds/ml/metrics/_classifier_metric.py create mode 100644 src/safeds/ml/metrics/_regressor_metric.py delete mode 100644 tests/safeds/ml/classical/regression/test_elastic_net_regression.py delete mode 100644 tests/safeds/ml/classical/regression/test_lasso_regression.py create mode 100644 tests/safeds/ml/classical/regression/test_linear_regressor.py delete mode 100644 tests/safeds/ml/classical/regression/test_ridge_regression.py diff --git a/src/safeds/ml/classical/_bases/_ada_boost_base.py b/src/safeds/ml/classical/_bases/_ada_boost_base.py index 5cacf341e..28de75da6 100644 --- a/src/safeds/ml/classical/_bases/_ada_boost_base.py +++ b/src/safeds/ml/classical/_bases/_ada_boost_base.py @@ -5,6 +5,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound, _OpenBound +from safeds.ml.hyperparameters import Choice if TYPE_CHECKING: from safeds.ml.classical import SupervisedModel @@ -18,16 +19,25 @@ class _AdaBoostBase(ABC): @abstractmethod def __init__( self, - max_learner_count: int, - learning_rate: float, + max_learner_count: int | Choice[int], + learning_rate: float | Choice[float], ) -> None: # Validation - _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1)) - _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0)) + if isinstance(max_learner_count, Choice): + for mlc in max_learner_count: + _check_bounds("max_learner_count", mlc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1)) + + if isinstance(learning_rate, Choice): + for lr in learning_rate: + _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0)) + else: + _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0)) # Hyperparameters - self._max_learner_count: int = max_learner_count - self._learning_rate: float = learning_rate + self._max_learner_count: int | Choice[int] = max_learner_count + self._learning_rate: float 
| Choice[float] = learning_rate def __hash__(self) -> int: return _structural_hash( @@ -40,16 +50,16 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def max_learner_count(self) -> int: + def max_learner_count(self) -> int | Choice[int]: """The maximum number of learners in the ensemble.""" return self._max_learner_count @property - def learning_rate(self) -> float: + def learning_rate(self) -> float | Choice[float]: """The learning rate.""" return self._learning_rate @property @abstractmethod - def learner(self) -> SupervisedModel | None: + def learner(self) -> SupervisedModel | None | Choice[SupervisedModel | None]: """The base learner used for training the ensemble.""" diff --git a/src/safeds/ml/classical/_bases/_decision_tree_base.py b/src/safeds/ml/classical/_bases/_decision_tree_base.py index 0b5d22823..5502a5edb 100644 --- a/src/safeds/ml/classical/_bases/_decision_tree_base.py +++ b/src/safeds/ml/classical/_bases/_decision_tree_base.py @@ -4,6 +4,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound +from safeds.ml.hyperparameters import Choice class _DecisionTreeBase(ABC): @@ -14,20 +15,24 @@ class _DecisionTreeBase(ABC): @abstractmethod def __init__( self, - max_depth: int | None, - min_sample_count_in_leaves: int, + max_depth: int | None | Choice[int | None], + min_sample_count_in_leaves: int | Choice[int], ) -> None: # Validation - _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1)) - _check_bounds( - "min_sample_count_in_leaves", - min_sample_count_in_leaves, - lower_bound=_ClosedBound(1), - ) + if isinstance(max_depth, Choice): + for md in max_depth: + _check_bounds("max_depth", md, lower_bound=_ClosedBound(1)) + else: + _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1)) + if isinstance(min_sample_count_in_leaves, Choice): + for msc in 
min_sample_count_in_leaves: + _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("min_sample_count_in_leaves", min_sample_count_in_leaves, lower_bound=_ClosedBound(1)) # Hyperparameters - self._max_depth: int | None = max_depth - self._min_sample_count_in_leaves: int = min_sample_count_in_leaves + self._max_depth: int | None | Choice[int | None] = max_depth + self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves def __hash__(self) -> int: return _structural_hash( @@ -40,11 +45,11 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def max_depth(self) -> int | None: + def max_depth(self) -> int | None | Choice[int | None]: """The maximum depth of the tree.""" return self._max_depth @property - def min_sample_count_in_leaves(self) -> int: + def min_sample_count_in_leaves(self) -> int | Choice[int]: """The minimum number of samples that must remain in the leaves of the tree.""" return self._min_sample_count_in_leaves diff --git a/src/safeds/ml/classical/_bases/_gradient_boosting_base.py b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py index 63a1370a0..df1831b74 100644 --- a/src/safeds/ml/classical/_bases/_gradient_boosting_base.py +++ b/src/safeds/ml/classical/_bases/_gradient_boosting_base.py @@ -4,6 +4,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound, _OpenBound +from safeds.ml.hyperparameters import Choice class _GradientBoostingBase(ABC): @@ -14,12 +15,21 @@ class _GradientBoostingBase(ABC): @abstractmethod def __init__( self, - tree_count: int, - learning_rate: float, + tree_count: int | Choice[int], + learning_rate: float | Choice[float], ) -> None: # Validation - _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1)) - _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0)) + if 
isinstance(tree_count, Choice): + for tc in tree_count: + _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1)) + + if isinstance(learning_rate, Choice): + for lr in learning_rate: + _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0)) + else: + _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0)) # Hyperparameters self._tree_count = tree_count @@ -36,11 +46,11 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def tree_count(self) -> int: + def tree_count(self) -> int | Choice[int]: """The number of trees (estimators) in the ensemble.""" return self._tree_count @property - def learning_rate(self) -> float: + def learning_rate(self) -> float | Choice[float]: """The learning rate.""" return self._learning_rate diff --git a/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py index 2113c4d9e..3f52ebb28 100644 --- a/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py +++ b/src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py @@ -4,6 +4,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound +from safeds.ml.hyperparameters import Choice class _KNearestNeighborsBase(ABC): @@ -14,10 +15,14 @@ class _KNearestNeighborsBase(ABC): @abstractmethod def __init__( self, - neighbor_count: int, + neighbor_count: int | Choice[int], ) -> None: # Validation - _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1)) + if isinstance(neighbor_count, Choice): + for nc in neighbor_count: + _check_bounds("neighbor_count", nc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1)) # Hyperparameters self._neighbor_count = neighbor_count @@ -32,6 +37,6 @@ def 
__hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def neighbor_count(self) -> int: + def neighbor_count(self) -> int | Choice[int]: """The number of neighbors used for interpolation.""" return self._neighbor_count diff --git a/src/safeds/ml/classical/_bases/_random_forest_base.py b/src/safeds/ml/classical/_bases/_random_forest_base.py index 56786e6ad..85e2a1acc 100644 --- a/src/safeds/ml/classical/_bases/_random_forest_base.py +++ b/src/safeds/ml/classical/_bases/_random_forest_base.py @@ -4,6 +4,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound +from safeds.ml.hyperparameters import Choice class _RandomForestBase(ABC): @@ -14,23 +15,33 @@ class _RandomForestBase(ABC): @abstractmethod def __init__( self, - tree_count: int, - max_depth: int | None, - min_sample_count_in_leaves: int, + tree_count: int | Choice[int], + max_depth: int | None | Choice[int | None], + min_sample_count_in_leaves: int | Choice[int], ) -> None: # Validation - _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1)) - _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1)) - _check_bounds( - "min_sample_count_in_leaves", - min_sample_count_in_leaves, - lower_bound=_ClosedBound(1), - ) + if isinstance(tree_count, Choice): + for tc in tree_count: + _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1)) + + if isinstance(max_depth, Choice): + for md in max_depth: + _check_bounds("max_depth", md, lower_bound=_ClosedBound(1)) + else: + _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1)) + + if isinstance(min_sample_count_in_leaves, Choice): + for msc in min_sample_count_in_leaves: + _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1)) + else: + _check_bounds("min_sample_count_in_leaves", 
min_sample_count_in_leaves, lower_bound=_ClosedBound(1)) # Hyperparameters - self._tree_count: int = tree_count - self._max_depth: int | None = max_depth - self._min_sample_count_in_leaves: int = min_sample_count_in_leaves + self._tree_count: int | Choice[int] = tree_count + self._max_depth: int | None | Choice[int | None] = max_depth + self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves def __hash__(self) -> int: return _structural_hash( @@ -44,16 +55,16 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def tree_count(self) -> int: + def tree_count(self) -> int | Choice[int]: """The number of trees used in the random forest.""" return self._tree_count @property - def max_depth(self) -> int | None: + def max_depth(self) -> int | None | Choice[int | None]: """The maximum depth of each tree.""" return self._max_depth @property - def min_sample_count_in_leaves(self) -> int: + def min_sample_count_in_leaves(self) -> int | Choice[int]: """The minimum number of samples that must remain in the leaves of each tree.""" return self._min_sample_count_in_leaves diff --git a/src/safeds/ml/classical/_bases/_support_vector_machine_base.py b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py index fc85a4b58..05047531d 100644 --- a/src/safeds/ml/classical/_bases/_support_vector_machine_base.py +++ b/src/safeds/ml/classical/_bases/_support_vector_machine_base.py @@ -6,6 +6,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound, _OpenBound +from safeds.ml.hyperparameters import Choice if TYPE_CHECKING: from sklearn.svm import SVC as SklearnSVC # noqa: N811 @@ -76,18 +77,22 @@ def sigmoid() -> _SupportVectorMachineBase.Kernel: @abstractmethod def __init__( self, - c: float, - kernel: _SupportVectorMachineBase.Kernel | None, + c: float | Choice[float], + kernel: 
_SupportVectorMachineBase.Kernel | None | Choice[_SupportVectorMachineBase.Kernel | None], ) -> None: if kernel is None: kernel = _SupportVectorMachineBase.Kernel.radial_basis_function() # Validation - _check_bounds("c", c, lower_bound=_OpenBound(0)) + if isinstance(c, Choice): + for value in c: + _check_bounds("c", value, lower_bound=_OpenBound(0)) + else: + _check_bounds("c", c, lower_bound=_OpenBound(0)) # Hyperparameters - self._c: float = c - self._kernel: _SupportVectorMachineBase.Kernel = kernel + self._c: float | Choice[float] = c + self._kernel: _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None] = kernel def __hash__(self) -> int: return _structural_hash( @@ -100,14 +105,14 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def c(self) -> float: + def c(self) -> float | Choice[float]: """The regularization strength.""" return self._c # This property is abstract, so subclasses must declare a public return type. @property @abstractmethod - def kernel(self) -> _SupportVectorMachineBase.Kernel: + def kernel(self) -> _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None]: """The type of kernel used.""" diff --git a/src/safeds/ml/classical/_supervised_model.py b/src/safeds/ml/classical/_supervised_model.py index 8c5e966f9..927db02bd 100644 --- a/src/safeds/ml/classical/_supervised_model.py +++ b/src/safeds/ml/classical/_supervised_model.py @@ -80,6 +80,12 @@ def fit(self, training_set: TabularDataset) -> Self: Raises ------ + PlainTableError + If a table is passed instead of a TabularDataset. + DatasetMissesDataError + If the given training set contains no data. + FittingWithChoiceError + When trying to call this method on a model with hyperparameter choices. LearningError If the training data contains invalid values or if the training failed. 
""" @@ -88,7 +94,8 @@ def fit(self, training_set: TabularDataset) -> Self: if training_set.to_table().row_count == 0: raise DatasetMissesDataError - self._check_additional_fit_preconditions(training_set) + self._check_additional_fit_preconditions() + self._check_more_additional_fit_preconditions(training_set) wrapped_model = self._get_sklearn_model() _fit_sklearn_model_in_place(wrapped_model, training_set) @@ -234,15 +241,14 @@ def get_target_type(self) -> DataType: # Template methods # ------------------------------------------------------------------------------------------------------------------ - def _check_additional_fit_preconditions(self, training_set: TabularDataset) -> None: # noqa: B027 - """ - Check additional preconditions for fitting the model and raise an error if any are violated. + def _check_additional_fit_preconditions(self) -> None: # noqa: B027 + """Check additional preconditions for fitting the model and raise an error if any are violated.""" - Parameters - ---------- - training_set: - The training data containing the features and target. - """ + def _check_more_additional_fit_preconditions(self, training_set: TabularDataset) -> None: # noqa: B027 + """Check additional preconditions for fitting the model and raise an error if any are violated.""" + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: # noqa: B027 + """Check additional preconditions for fitting by exhaustive search and raise an error if any are violated.""" def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset) -> None: # noqa: B027 """ @@ -254,6 +260,10 @@ def _check_additional_predict_preconditions(self, dataset: Table | TabularDatase The dataset containing at least the features. 
""" + def _get_models_for_all_choices(self) -> list[Self]: + """Get a list of all possible models, given the Parameter Choices.""" + raise NotImplementedError # pragma: no cover + @abstractmethod def _clone(self) -> Self: """ diff --git a/src/safeds/ml/classical/classification/_ada_boost_classifier.py b/src/safeds/ml/classical/classification/_ada_boost_classifier.py index e8fa50ae9..3dfb32b60 100644 --- a/src/safeds/ml/classical/classification/_ada_boost_classifier.py +++ b/src/safeds/ml/classical/classification/_ada_boost_classifier.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _AdaBoostBase +from safeds.ml.hyperparameters import Choice from ._classifier import Classifier @@ -39,9 +41,9 @@ class AdaBoostClassifier(Classifier, _AdaBoostBase): def __init__( self, *, - learner: Classifier | None = None, - max_learner_count: int = 50, - learning_rate: float = 1.0, + learner: Classifier | None | Choice[Classifier | None] = None, + max_learner_count: int | Choice[int] = 50, + learning_rate: float | Choice[float] = 1.0, ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -52,7 +54,7 @@ def __init__( ) # Hyperparameters - self._learner: Classifier | None = learner + self._learner: Classifier | None | Choice[Classifier | None] = learner def __hash__(self) -> int: return _structural_hash( @@ -66,7 +68,7 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def learner(self) -> Classifier | None: + def learner(self) -> Classifier | None | Choice[Classifier | None]: # type: ignore[override] """The base learner used for training the ensemble.""" return self._learner @@ -84,9 +86,43 @@ def _clone(self) -> AdaBoostClassifier: def _get_sklearn_model(self) -> ClassifierMixin: from sklearn.ensemble 
import AdaBoostClassifier as SklearnAdaBoostClassifier + assert not isinstance(self.learner, Choice) learner = self.learner._get_sklearn_model() if self.learner is not None else None return SklearnAdaBoostClassifier( estimator=learner, n_estimators=self._max_learner_count, learning_rate=self._learning_rate, + algorithm="SAMME", # Will be the default in sklearn 1.6, remove this line then ) + + def _check_additional_fit_preconditions(self) -> None: + if ( + isinstance(self._max_learner_count, Choice) + or isinstance(self._learning_rate, Choice) + or isinstance(self._learner, Choice) + ): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if ( + not isinstance(self._max_learner_count, Choice) + and not isinstance(self._learning_rate, Choice) + and not isinstance(self._learner, Choice) + ): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[AdaBoostClassifier]: + learner_choices = self._learner if isinstance(self._learner, Choice) else [self._learner] + max_learner_count_choices = ( + self._max_learner_count if isinstance(self._max_learner_count, Choice) else [self._max_learner_count] + ) + learning_rate_choices = ( + self._learning_rate if isinstance(self._learning_rate, Choice) else [self._learning_rate] + ) + + models = [] + for learner in learner_choices: + for mlc in max_learner_count_choices: + for lr in learning_rate_choices: + models.append(AdaBoostClassifier(learner=learner, max_learner_count=mlc, learning_rate=lr)) + return models diff --git a/src/safeds/ml/classical/classification/_classifier.py b/src/safeds/ml/classical/classification/_classifier.py index c05159d69..b87c5a5af 100644 --- a/src/safeds/ml/classical/classification/_classifier.py +++ b/src/safeds/ml/classical/classification/_classifier.py @@ -1,12 +1,15 @@ from __future__ import annotations from abc import ABC -from typing import TYPE_CHECKING +from concurrent.futures import ALL_COMPLETED, 
ProcessPoolExecutor, wait +from typing import TYPE_CHECKING, Self + +from joblib._multiprocessing_helpers import mp from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError +from safeds.exceptions import DatasetMissesDataError, LearningError, ModelNotFittedError from safeds.ml.classical import SupervisedModel -from safeds.ml.metrics import ClassificationMetrics +from safeds.ml.metrics import ClassificationMetrics, ClassifierMetric if TYPE_CHECKING: from typing import Any @@ -212,6 +215,112 @@ def recall(self, validation_or_test_set: Table | TabularDataset, positive_class: positive_class, ) + def fit_by_exhaustive_search( + self, + training_set: TabularDataset, + optimization_metric: ClassifierMetric, + positive_class: Any = None, + ) -> Self: + """ + Use the hyperparameter choices to create multiple models and fit them. + + **Note:** This model is not modified. + + Parameters + ---------- + training_set: + The training data containing the features and target. + optimization_metric: + The metric that should be used for determining the performance of a model. + positive_class: + The class to be considered positive. All other classes are considered negative. + Needs to be provided when choosing precision, f1score or recall as optimization metric. + + Returns + ------- + best_model: + The model that performed the best out of all possible models given the Choices of hyperparameters. + + Raises + ------ + PlainTableError + If a table is passed instead of a TabularDataset. + DatasetMissesDataError + If the given training set contains no data. + FittingWithoutChoiceError + When trying to call this method on a model without hyperparameter choices. + LearningError + If the training data contains invalid values or if the training failed. 
+ """ + if training_set.to_table().row_count == 0: + raise DatasetMissesDataError + if optimization_metric.value in {"precision", "recall", "f1_score"} and positive_class is None: + raise LearningError( + f"Please provide a positive class when using optimization metric '{optimization_metric.value}'", + ) + + self._check_additional_fit_by_exhaustive_search_preconditions() + + [train_split, test_split] = training_set.to_table().split_rows(0.75) + train_data = train_split.to_tabular_dataset( + target_name=training_set.target.name, + extra_names=training_set.extras.column_names, + ) + test_data = test_split.to_tabular_dataset( + target_name=training_set.target.name, + extra_names=training_set.extras.column_names, + ) + + list_of_models = self._get_models_for_all_choices() + list_of_fitted_models = [] + + with ProcessPoolExecutor(max_workers=len(list_of_models), mp_context=mp.get_context("spawn")) as executor: + futures = [] + for model in list_of_models: + futures.append(executor.submit(model.fit, train_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + list_of_fitted_models.append(future.result()) + executor.shutdown() + best_model = None + best_metric_value = None + for fitted_model in list_of_fitted_models: + if best_model is None: + best_model = fitted_model + match optimization_metric.value: + case "accuracy": + best_metric_value = fitted_model.accuracy(test_data) + case "precision": + best_metric_value = fitted_model.precision(test_data, positive_class) + case "recall": + best_metric_value = fitted_model.recall(test_data, positive_class) + case "f1_score": + best_metric_value = fitted_model.f1_score(test_data, positive_class) + else: + match optimization_metric.value: + case "accuracy": + accuracy_of_fitted_model = fitted_model.accuracy(test_data) + if accuracy_of_fitted_model > best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = accuracy_of_fitted_model # pragma: no cover + case "precision": + 
precision_of_fitted_model = fitted_model.precision(test_data, positive_class) + if precision_of_fitted_model > best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = precision_of_fitted_model # pragma: no cover + case "recall": + recall_of_fitted_model = fitted_model.recall(test_data, positive_class) + if recall_of_fitted_model > best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = recall_of_fitted_model # pragma: no cover + case "f1_score": + f1score_of_fitted_model = fitted_model.f1_score(test_data, positive_class) + if f1score_of_fitted_model > best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = f1score_of_fitted_model # pragma: no cover + assert best_model is not None + return best_model + def _extract_table(table_or_dataset: Table | TabularDataset) -> Table: """Extract the table from the given table or dataset.""" diff --git a/src/safeds/ml/classical/classification/_decision_tree_classifier.py b/src/safeds/ml/classical/classification/_decision_tree_classifier.py index caa20b030..e9a363db9 100644 --- a/src/safeds/ml/classical/classification/_decision_tree_classifier.py +++ b/src/safeds/ml/classical/classification/_decision_tree_classifier.py @@ -4,8 +4,10 @@ from safeds._utils import _structural_hash from safeds.data.image.containers import Image +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.exceptions._ml import ModelNotFittedError from safeds.ml.classical._bases import _DecisionTreeBase +from safeds.ml.hyperparameters import Choice from ._classifier import Classifier @@ -39,8 +41,8 @@ class DecisionTreeClassifier(Classifier, _DecisionTreeBase): def __init__( self, *, - max_depth: int | None = None, - min_sample_count_in_leaves: int = 1, + max_depth: int | None | Choice[int | None] = None, + min_sample_count_in_leaves: int | Choice[int] = 1, ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -74,6 
+76,28 @@ def _get_sklearn_model(self) -> ClassifierMixin: min_samples_leaf=self._min_sample_count_in_leaves, ) + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._max_depth, Choice) or isinstance(self._min_sample_count_in_leaves, Choice): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._max_depth, Choice) and not isinstance(self._min_sample_count_in_leaves, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[DecisionTreeClassifier]: + max_depth_choices = self._max_depth if isinstance(self._max_depth, Choice) else [self._max_depth] + min_sample_count_choices = ( + self._min_sample_count_in_leaves + if isinstance(self._min_sample_count_in_leaves, Choice) + else [self._min_sample_count_in_leaves] + ) + + models = [] + for md in max_depth_choices: + for msc in min_sample_count_choices: + models.append(DecisionTreeClassifier(max_depth=md, min_sample_count_in_leaves=msc)) + return models + # ------------------------------------------------------------------------------------------------------------------ # Plot # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py index f2b78bace..61d733437 100644 --- a/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py +++ b/src/safeds/ml/classical/classification/_gradient_boosting_classifier.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _GradientBoostingBase +from safeds.ml.hyperparameters import Choice from ._classifier import Classifier @@ -37,8 +39,8 @@ class 
GradientBoostingClassifier(Classifier, _GradientBoostingBase): def __init__( self, *, - tree_count: int = 100, - learning_rate: float = 0.1, + tree_count: int | Choice[int] = 100, + learning_rate: float | Choice[float] = 0.1, ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -71,3 +73,23 @@ def _get_sklearn_model(self) -> ClassifierMixin: n_estimators=self._tree_count, learning_rate=self._learning_rate, ) + + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._tree_count, Choice) or isinstance(self._learning_rate, Choice): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._tree_count, Choice) and not isinstance(self._learning_rate, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[GradientBoostingClassifier]: + tree_count_choices = self._tree_count if isinstance(self._tree_count, Choice) else [self._tree_count] + learning_rate_choices = ( + self._learning_rate if isinstance(self._learning_rate, Choice) else [self._learning_rate] + ) + + models = [] + for tc in tree_count_choices: + for lr in learning_rate_choices: + models.append(GradientBoostingClassifier(tree_count=tc, learning_rate=lr)) + return models diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py index 0dabbffc0..3181802ae 100644 --- a/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py +++ b/src/safeds/ml/classical/classification/_k_nearest_neighbors_classifier.py @@ -3,9 +3,10 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _KNearestNeighborsBase - -from ._classifier import Classifier +from safeds.ml.classical.classification import Classifier +from 
safeds.ml.hyperparameters import Choice if TYPE_CHECKING: from sklearn.base import ClassifierMixin @@ -35,7 +36,7 @@ class KNearestNeighborsClassifier(Classifier, _KNearestNeighborsBase): def __init__( self, - neighbor_count: int, + neighbor_count: int | Choice[int], ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -54,15 +55,6 @@ def __hash__(self) -> int: # Template methods # ------------------------------------------------------------------------------------------------------------------ - def _check_additional_fit_preconditions(self, training_set: TabularDataset) -> None: - if self._neighbor_count > training_set._table.row_count: - raise ValueError( - ( - f"The parameter 'neighbor_count' ({self._neighbor_count}) has to be less than or equal to" - f" the sample size ({training_set._table.row_count})." - ), - ) - def _clone(self) -> KNearestNeighborsClassifier: return KNearestNeighborsClassifier( neighbor_count=self._neighbor_count, @@ -75,3 +67,25 @@ def _get_sklearn_model(self) -> ClassifierMixin: n_neighbors=self._neighbor_count, n_jobs=-1, ) + + def _check_more_additional_fit_preconditions(self, training_set: TabularDataset) -> None: + if isinstance(self._neighbor_count, Choice): + raise FittingWithChoiceError + if self._neighbor_count > training_set._table.row_count: + raise ValueError( + ( + f"The parameter 'neighbor_count' ({self._neighbor_count}) has to be less than or equal to" + f" the sample size ({training_set._table.row_count})." 
+ ), + ) + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._neighbor_count, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[KNearestNeighborsClassifier]: + assert isinstance(self._neighbor_count, Choice) # this is always true and just here for linting + models = [] + for nc in self._neighbor_count: + models.append(KNearestNeighborsClassifier(neighbor_count=nc)) + return models diff --git a/src/safeds/ml/classical/classification/_logistic_classifier.py b/src/safeds/ml/classical/classification/_logistic_classifier.py index e00a8d399..d6aa48f56 100644 --- a/src/safeds/ml/classical/classification/_logistic_classifier.py +++ b/src/safeds/ml/classical/classification/_logistic_classifier.py @@ -4,6 +4,7 @@ from safeds._utils import _get_random_seed, _structural_hash from safeds._validation import _check_bounds, _OpenBound +from safeds.exceptions import FittingWithoutChoiceError from ._classifier import Classifier @@ -63,3 +64,6 @@ def _get_sklearn_model(self) -> ClassifierMixin: n_jobs=-1, C=self.c, ) + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + raise FittingWithoutChoiceError diff --git a/src/safeds/ml/classical/classification/_random_forest_classifier.py b/src/safeds/ml/classical/classification/_random_forest_classifier.py index 3603ab292..ea0aef340 100644 --- a/src/safeds/ml/classical/classification/_random_forest_classifier.py +++ b/src/safeds/ml/classical/classification/_random_forest_classifier.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _get_random_seed, _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _RandomForestBase +from safeds.ml.hyperparameters import Choice from ._classifier import Classifier @@ -41,9 +43,9 @@ class RandomForestClassifier(Classifier, _RandomForestBase): def __init__( self, *, - 
tree_count: int = 100, - max_depth: int | None = None, - min_sample_count_in_leaves: int = 1, + tree_count: int | Choice[int] = 100, + max_depth: int | None | Choice[int | None] = None, + min_sample_count_in_leaves: int | Choice[int] = 1, ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -81,3 +83,35 @@ def _get_sklearn_model(self) -> ClassifierMixin: random_state=_get_random_seed(), n_jobs=-1, ) + + def _check_additional_fit_preconditions(self) -> None: + if ( + isinstance(self._tree_count, Choice) + or isinstance(self._max_depth, Choice) + or isinstance(self._min_sample_count_in_leaves, Choice) + ): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if ( + not isinstance(self._tree_count, Choice) + and not isinstance(self._max_depth, Choice) + and not isinstance(self._min_sample_count_in_leaves, Choice) + ): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[RandomForestClassifier]: + tree_count_choices = self._tree_count if isinstance(self._tree_count, Choice) else [self._tree_count] + max_depth_choices = self._max_depth if isinstance(self._max_depth, Choice) else [self._max_depth] + min_sample_count_choices = ( + self._min_sample_count_in_leaves + if isinstance(self._min_sample_count_in_leaves, Choice) + else [self._min_sample_count_in_leaves] + ) + + models = [] + for tc in tree_count_choices: + for md in max_depth_choices: + for msc in min_sample_count_choices: + models.append(RandomForestClassifier(tree_count=tc, max_depth=md, min_sample_count_in_leaves=msc)) + return models diff --git a/src/safeds/ml/classical/classification/_support_vector_classifier.py b/src/safeds/ml/classical/classification/_support_vector_classifier.py index 407c8f97a..03895ce87 100644 --- a/src/safeds/ml/classical/classification/_support_vector_classifier.py +++ b/src/safeds/ml/classical/classification/_support_vector_classifier.py @@ -3,8 +3,10 @@ from typing import 
TYPE_CHECKING from safeds._utils import _get_random_seed, _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _SupportVectorMachineBase from safeds.ml.classical.classification import Classifier +from safeds.ml.hyperparameters import Choice if TYPE_CHECKING: from sklearn.base import ClassifierMixin @@ -34,8 +36,8 @@ class SupportVectorClassifier(Classifier, _SupportVectorMachineBase): def __init__( self, *, - c: float = 1.0, - kernel: SupportVectorClassifier.Kernel | None = None, + c: float | Choice[float] = 1.0, + kernel: SupportVectorClassifier.Kernel | None | Choice[SupportVectorClassifier.Kernel | None] = None, ) -> None: # Initialize superclasses Classifier.__init__(self) @@ -56,7 +58,7 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def kernel(self) -> SupportVectorClassifier.Kernel: + def kernel(self) -> SupportVectorClassifier.Kernel | Choice[SupportVectorClassifier.Kernel | None]: """The type of kernel used.""" return self._kernel @@ -77,5 +79,25 @@ def _get_sklearn_model(self) -> ClassifierMixin: C=self._c, random_state=_get_random_seed(), ) + assert not isinstance(self._kernel, Choice) self._kernel._apply(result) return result + + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._c, Choice) or isinstance(self._kernel, Choice): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._c, Choice) and not isinstance(self._kernel, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[SupportVectorClassifier]: + # assert isinstance(self._c, Choice) # this is always true and just here for linting + c_choices = self._c if isinstance(self._c, Choice) else [self._c] + kernel_choices = self.kernel if isinstance(self.kernel, 
Choice) else [self.kernel] + + models = [] + for c in c_choices: + for kernel in kernel_choices: + models.append(SupportVectorClassifier(c=c, kernel=kernel)) + return models diff --git a/src/safeds/ml/classical/regression/__init__.py b/src/safeds/ml/classical/regression/__init__.py index 1dd3f627a..51dc7bb07 100644 --- a/src/safeds/ml/classical/regression/__init__.py +++ b/src/safeds/ml/classical/regression/__init__.py @@ -9,14 +9,11 @@ from ._arima import ArimaModelRegressor from ._baseline_regressor import BaselineRegressor from ._decision_tree_regressor import DecisionTreeRegressor - from ._elastic_net_regressor import ElasticNetRegressor from ._gradient_boosting_regressor import GradientBoostingRegressor from ._k_nearest_neighbors_regressor import KNearestNeighborsRegressor - from ._lasso_regressor import LassoRegressor from ._linear_regressor import LinearRegressor from ._random_forest_regressor import RandomForestRegressor from ._regressor import Regressor - from ._ridge_regressor import RidgeRegressor from ._support_vector_regressor import SupportVectorRegressor apipkg.initpkg( @@ -26,14 +23,11 @@ "ArimaModelRegressor": "._arima:ArimaModelRegressor", "BaselineRegressor": "._baseline_regressor:BaselineRegressor", "DecisionTreeRegressor": "._decision_tree_regressor:DecisionTreeRegressor", - "ElasticNetRegressor": "._elastic_net_regressor:ElasticNetRegressor", "GradientBoostingRegressor": "._gradient_boosting_regressor:GradientBoostingRegressor", "KNearestNeighborsRegressor": "._k_nearest_neighbors_regressor:KNearestNeighborsRegressor", - "LassoRegressor": "._lasso_regressor:LassoRegressor", "LinearRegressor": "._linear_regressor:LinearRegressor", "RandomForestRegressor": "._random_forest_regressor:RandomForestRegressor", "Regressor": "._regressor:Regressor", - "RidgeRegressor": "._ridge_regressor:RidgeRegressor", "SupportVectorRegressor": "._support_vector_regressor:SupportVectorRegressor", }, ) @@ -43,13 +37,10 @@ "ArimaModelRegressor", "BaselineRegressor", 
"DecisionTreeRegressor", - "ElasticNetRegressor", "GradientBoostingRegressor", "KNearestNeighborsRegressor", - "LassoRegressor", "LinearRegressor", "RandomForestRegressor", "Regressor", - "RidgeRegressor", "SupportVectorRegressor", ] diff --git a/src/safeds/ml/classical/regression/_ada_boost_regressor.py b/src/safeds/ml/classical/regression/_ada_boost_regressor.py index 25f509229..ef09a7b08 100644 --- a/src/safeds/ml/classical/regression/_ada_boost_regressor.py +++ b/src/safeds/ml/classical/regression/_ada_boost_regressor.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _AdaBoostBase +from safeds.ml.hyperparameters import Choice from ._regressor import Regressor @@ -39,9 +41,9 @@ class AdaBoostRegressor(Regressor, _AdaBoostBase): def __init__( self, *, - learner: Regressor | None = None, - max_learner_count: int = 50, - learning_rate: float = 1.0, + learner: Regressor | None | Choice[Regressor | None] = None, + max_learner_count: int | Choice[int] = 50, + learning_rate: float | Choice[float] = 1.0, ) -> None: # Initialize superclasses Regressor.__init__(self) @@ -52,7 +54,7 @@ def __init__( ) # Hyperparameters - self._learner: Regressor | None = learner + self._learner: Regressor | None | Choice[Regressor | None] = learner def __hash__(self) -> int: return _structural_hash( @@ -66,7 +68,7 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def learner(self) -> Regressor | None: + def learner(self) -> Regressor | None | Choice[Regressor | None]: # type: ignore[override] """The base learner used for training the ensemble.""" return self._learner @@ -84,9 +86,42 @@ def _clone(self) -> AdaBoostRegressor: def _get_sklearn_model(self) -> RegressorMixin: from sklearn.ensemble import AdaBoostRegressor as 
SklearnAdaBoostRegressor + assert not isinstance(self.learner, Choice) learner = self.learner._get_sklearn_model() if self.learner is not None else None return SklearnAdaBoostRegressor( estimator=learner, n_estimators=self._max_learner_count, learning_rate=self._learning_rate, ) + + def _check_additional_fit_preconditions(self) -> None: + if ( + isinstance(self._max_learner_count, Choice) + or isinstance(self._learning_rate, Choice) + or isinstance(self._learner, Choice) + ): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if ( + not isinstance(self._max_learner_count, Choice) + and not isinstance(self._learning_rate, Choice) + and not isinstance(self._learner, Choice) + ): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[AdaBoostRegressor]: + learner_choices = self._learner if isinstance(self._learner, Choice) else [self._learner] + max_learner_count_choices = ( + self._max_learner_count if isinstance(self._max_learner_count, Choice) else [self._max_learner_count] + ) + learning_rate_choices = ( + self._learning_rate if isinstance(self._learning_rate, Choice) else [self._learning_rate] + ) + + models = [] + for learner in learner_choices: + for mlc in max_learner_count_choices: + for lr in learning_rate_choices: + models.append(AdaBoostRegressor(learner=learner, max_learner_count=mlc, learning_rate=lr)) + return models diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 2efd1b10d..79046c3cc 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -14,13 +14,10 @@ from safeds.ml.classical.regression import ( AdaBoostRegressor, DecisionTreeRegressor, - ElasticNetRegressor, GradientBoostingRegressor, - LassoRegressor, LinearRegressor, RandomForestRegressor, Regressor, - RidgeRegressor, 
SupportVectorRegressor, ) @@ -52,14 +49,18 @@ def __init__(self, extended_search: bool = False): AdaBoostRegressor(), DecisionTreeRegressor(), LinearRegressor(), + LinearRegressor(LinearRegressor.Penalty.ridge()), RandomForestRegressor(), - RidgeRegressor(), SupportVectorRegressor(), ] if extended_search: self._list_of_model_types.extend( - [ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()], + [ + LinearRegressor(LinearRegressor.Penalty.elastic_net()), + LinearRegressor(LinearRegressor.Penalty.lasso()), + GradientBoostingRegressor(), + ], ) # pragma: no cover self._fitted_models: list[Regressor] = [] diff --git a/src/safeds/ml/classical/regression/_decision_tree_regressor.py b/src/safeds/ml/classical/regression/_decision_tree_regressor.py index 19ea07400..37ee02030 100644 --- a/src/safeds/ml/classical/regression/_decision_tree_regressor.py +++ b/src/safeds/ml/classical/regression/_decision_tree_regressor.py @@ -4,8 +4,10 @@ from safeds._utils import _structural_hash from safeds.data.image.containers import Image +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.exceptions._ml import ModelNotFittedError from safeds.ml.classical._bases import _DecisionTreeBase +from safeds.ml.hyperparameters import Choice from ._regressor import Regressor @@ -39,8 +41,8 @@ class DecisionTreeRegressor(Regressor, _DecisionTreeBase): def __init__( self, *, - max_depth: int | None = None, - min_sample_count_in_leaves: int = 5, + max_depth: int | None | Choice[int | None] = None, + min_sample_count_in_leaves: int | Choice[int] = 5, ) -> None: # Initialize superclasses Regressor.__init__(self) @@ -74,6 +76,28 @@ def _get_sklearn_model(self) -> RegressorMixin: min_samples_leaf=self._min_sample_count_in_leaves, ) + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._max_depth, Choice) or isinstance(self._min_sample_count_in_leaves, Choice): + raise FittingWithChoiceError + + def 
_check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._max_depth, Choice) and not isinstance(self._min_sample_count_in_leaves, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[DecisionTreeRegressor]: + max_depth_choices = self._max_depth if isinstance(self._max_depth, Choice) else [self._max_depth] + min_sample_count_choices = ( + self._min_sample_count_in_leaves + if isinstance(self._min_sample_count_in_leaves, Choice) + else [self._min_sample_count_in_leaves] + ) + + models = [] + for md in max_depth_choices: + for msc in min_sample_count_choices: + models.append(DecisionTreeRegressor(max_depth=md, min_sample_count_in_leaves=msc)) + return models + # ------------------------------------------------------------------------------------------------------------------ # Plot # ------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/regression/_elastic_net_regressor.py b/src/safeds/ml/classical/regression/_elastic_net_regressor.py deleted file mode 100644 index 0e0fedc64..000000000 --- a/src/safeds/ml/classical/regression/_elastic_net_regressor.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING -from warnings import warn - -from safeds._utils import _structural_hash -from safeds._validation import _check_bounds, _ClosedBound - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - - -class ElasticNetRegressor(Regressor): - """Elastic net regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - lasso_ratio: - Number between 0 and 1 that controls the ratio between Lasso and Ridge regularization. If 0, only Ridge - regularization is used. If 1, only Lasso regularization is used. 
- - Raises - ------ - OutOfBoundsError - If `alpha` is negative or `lasso_ratio` is not between 0 and 1. - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__(self, *, alpha: float = 1.0, lasso_ratio: float = 0.5) -> None: - super().__init__() - - # Validation - _check_bounds("alpha", alpha, lower_bound=_ClosedBound(0)) - if alpha == 0: - warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " - "LinearRegression instead for better numerical stability." - ), - UserWarning, - stacklevel=2, - ) - - _check_bounds("lasso_ratio", lasso_ratio, lower_bound=_ClosedBound(0), upper_bound=_ClosedBound(1)) - if lasso_ratio == 0: - warnings.warn( - ( - "ElasticNetRegression with lasso_ratio = 0 is essentially RidgeRegression." - " Use RidgeRegression instead for better numerical stability." - ), - stacklevel=2, - ) - elif lasso_ratio == 1: - warnings.warn( - ( - "ElasticNetRegression with lasso_ratio = 0 is essentially LassoRegression." - " Use LassoRegression instead for better numerical stability." 
- ), - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - self._lasso_ratio = lasso_ratio - - def __hash__(self) -> int: - return _structural_hash( - super().__hash__(), - self._alpha, - self._lasso_ratio, - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def alpha(self) -> float: - """The regularization of the model.""" - return self._alpha - - @property - def lasso_ratio(self) -> float: - """Rhe ratio between Lasso and Ridge regularization.""" - return self._lasso_ratio - - # ------------------------------------------------------------------------------------------------------------------ - # Template methods - # ------------------------------------------------------------------------------------------------------------------ - - def _clone(self) -> ElasticNetRegressor: - return ElasticNetRegressor( - alpha=self._alpha, - lasso_ratio=self._lasso_ratio, - ) - - def _get_sklearn_model(self) -> RegressorMixin: - from sklearn.linear_model import ElasticNet as SklearnElasticNet - - return SklearnElasticNet( - alpha=self._alpha, - l1_ratio=self._lasso_ratio, - ) diff --git a/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py b/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py index deef0971b..50ee2214f 100644 --- a/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py +++ b/src/safeds/ml/classical/regression/_gradient_boosting_regressor.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _GradientBoostingBase +from safeds.ml.hyperparameters import Choice from ._regressor import Regressor @@ -37,8 +39,8 @@ class 
GradientBoostingRegressor(Regressor, _GradientBoostingBase): def __init__( self, *, - tree_count: int = 100, - learning_rate: float = 0.1, + tree_count: int | Choice[int] = 100, + learning_rate: float | Choice[float] = 0.1, ) -> None: # Initialize superclasses Regressor.__init__(self) @@ -71,3 +73,23 @@ def _get_sklearn_model(self) -> RegressorMixin: n_estimators=self._tree_count, learning_rate=self._learning_rate, ) + + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._tree_count, Choice) or isinstance(self._learning_rate, Choice): + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._tree_count, Choice) and not isinstance(self._learning_rate, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[GradientBoostingRegressor]: + tree_count_choices = self._tree_count if isinstance(self._tree_count, Choice) else [self._tree_count] + learning_rate_choices = ( + self._learning_rate if isinstance(self._learning_rate, Choice) else [self._learning_rate] + ) + + models = [] + for tc in tree_count_choices: + for lr in learning_rate_choices: + models.append(GradientBoostingRegressor(tree_count=tc, learning_rate=lr)) + return models diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py index 2766fdcbb..d999996b6 100644 --- a/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py +++ b/src/safeds/ml/classical/regression/_k_nearest_neighbors_regressor.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _KNearestNeighborsBase +from safeds.ml.hyperparameters import Choice from ._regressor import Regressor @@ -33,7 +35,7 @@ class 
KNearestNeighborsRegressor(Regressor, _KNearestNeighborsBase): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, neighbor_count: int) -> None: + def __init__(self, neighbor_count: int | Choice[int]) -> None: # Initialize superclasses Regressor.__init__(self) _KNearestNeighborsBase.__init__( @@ -51,15 +53,6 @@ def __hash__(self) -> int: # Template methods # ------------------------------------------------------------------------------------------------------------------ - def _check_additional_fit_preconditions(self, training_set: TabularDataset) -> None: - if self._neighbor_count > training_set.to_table().row_count: - raise ValueError( - ( - f"The parameter 'neighbor_count' ({self._neighbor_count}) has to be less than or equal to" - f" the sample size ({training_set.to_table().row_count})." - ), - ) - def _clone(self) -> KNearestNeighborsRegressor: return KNearestNeighborsRegressor( neighbor_count=self._neighbor_count, @@ -72,3 +65,25 @@ def _get_sklearn_model(self) -> RegressorMixin: n_neighbors=self._neighbor_count, n_jobs=-1, ) + + def _check_more_additional_fit_preconditions(self, training_set: TabularDataset) -> None: + if isinstance(self._neighbor_count, Choice): + raise FittingWithChoiceError + if self._neighbor_count > training_set._table.row_count: + raise ValueError( + ( + f"The parameter 'neighbor_count' ({self._neighbor_count}) has to be less than or equal to" + f" the sample size ({training_set._table.row_count})." 
+ ), + ) + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._neighbor_count, Choice): + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[KNearestNeighborsRegressor]: + assert isinstance(self._neighbor_count, Choice) # this is always true and just here for linting + models = [] + for nc in self._neighbor_count: + models.append(KNearestNeighborsRegressor(neighbor_count=nc)) + return models diff --git a/src/safeds/ml/classical/regression/_lasso_regressor.py b/src/safeds/ml/classical/regression/_lasso_regressor.py deleted file mode 100644 index f9cae7daa..000000000 --- a/src/safeds/ml/classical/regression/_lasso_regressor.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -from warnings import warn - -from safeds._utils import _structural_hash -from safeds._validation import _check_bounds, _ClosedBound - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - - -class LassoRegressor(Regressor): - """Lasso regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - - Raises - ------ - OutOfBoundsError - If `alpha` is negative. - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__(self, *, alpha: float = 1.0) -> None: - super().__init__() - - # Validation - _check_bounds("alpha", alpha, lower_bound=_ClosedBound(0)) - if alpha == 0: - warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " - "LinearRegression instead for better numerical stability." 
- ), - UserWarning, - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - - def __hash__(self) -> int: - return _structural_hash( - super().__hash__(), - self._alpha, - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def alpha(self) -> float: - """ - Get the regularization of the model. - - Returns - ------- - result: - The regularization of the model. - """ - return self._alpha - - # ------------------------------------------------------------------------------------------------------------------ - # Template methods - # ------------------------------------------------------------------------------------------------------------------ - - def _clone(self) -> LassoRegressor: - return LassoRegressor( - alpha=self._alpha, - ) - - def _get_sklearn_model(self) -> RegressorMixin: - from sklearn.linear_model import Lasso as SklearnLasso - - return SklearnLasso(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_linear_regressor.py b/src/safeds/ml/classical/regression/_linear_regressor.py index 8a61d13fd..f6ada488e 100644 --- a/src/safeds/ml/classical/regression/_linear_regressor.py +++ b/src/safeds/ml/classical/regression/_linear_regressor.py @@ -1,8 +1,18 @@ from __future__ import annotations +import sys +from abc import ABC, abstractmethod from typing import TYPE_CHECKING +from sklearn.linear_model import ElasticNet as SklearnElasticNet +from sklearn.linear_model import Lasso as SklearnLasso +from sklearn.linear_model import LinearRegression as SklearnLinear +from sklearn.linear_model import Ridge as SklearnRidge + from safeds._utils import _structural_hash +from safeds._validation import _check_bounds, _ClosedBound +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError +from 
safeds.ml.hyperparameters import Choice from ._regressor import Regressor @@ -11,28 +21,353 @@ class LinearRegressor(Regressor): - """Linear regression.""" + """ + Linear regression. + + Parameters + ---------- + penalty: + The type of penalty to be used. Defaults to a simple linear regression. + """ + + # ------------------------------------------------------------------------------------------------------------------ + # Inner classes + # ------------------------------------------------------------------------------------------------------------------ + + class Penalty(ABC): + """ + Possible penalties for the linear regressor. + + Use the static factory methods to create instances of this class. + """ + + @abstractmethod + def __eq__(self, other: object) -> bool: ... + + @abstractmethod + def __hash__(self) -> int: ... + + @abstractmethod + def __str__(self) -> str: ... + + @abstractmethod + def _get_sklearn_model(self) -> RegressorMixin: + """Get the model of a penalty.""" + + @abstractmethod + def _get_models_for_all_choices(self) -> list[LinearRegressor]: + """Get a list of all possible models, given the choices.""" + + @abstractmethod + def _contains_choice_parameters(self) -> bool: + """Return if any parameters of this penalty are choice instances.""" + + @staticmethod + def linear() -> LinearRegressor.Penalty: + """Create a linear penalty.""" + raise NotImplementedError # pragma: no cover + + @staticmethod + def ridge(alpha: float | Choice[float] = 1.0) -> LinearRegressor.Penalty: + """Create a ridge penalty.""" + raise NotImplementedError # pragma: no cover + + @staticmethod + def lasso(alpha: float | Choice[float] = 1.0) -> LinearRegressor.Penalty: + """Create a lasso penalty.""" + raise NotImplementedError # pragma: no cover + + @staticmethod + def elastic_net( + alpha: float | Choice[float] = 1.0, + lasso_ratio: float | Choice[float] = 0.5, + ) -> LinearRegressor.Penalty: + """Create an elastic net penalty.""" + raise NotImplementedError # pragma: no 
cover # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self) -> None: - super().__init__() + def __init__(self, penalty: LinearRegressor.Penalty | None | Choice[LinearRegressor.Penalty | None] = None) -> None: + Regressor.__init__(self) + if penalty is None: + penalty = LinearRegressor.Penalty.linear() + + # Hyperparameters + self._penalty: LinearRegressor.Penalty | Choice[LinearRegressor.Penalty | None] = penalty def __hash__(self) -> int: return _structural_hash( super().__hash__(), + self._penalty, ) # ------------------------------------------------------------------------------------------------------------------ - # Template methods + # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def penalty(self) -> LinearRegressor.Penalty | Choice[LinearRegressor.Penalty | None]: + """The regularization of the model.""" + return self._penalty + def _clone(self) -> LinearRegressor: - return LinearRegressor() + return LinearRegressor(penalty=self._penalty) def _get_sklearn_model(self) -> RegressorMixin: - from sklearn.linear_model import LinearRegression as sk_LinearRegression + assert not isinstance(self.penalty, Choice) + return self.penalty._get_sklearn_model() + + def _check_additional_fit_preconditions(self) -> None: + if isinstance(self._penalty, Choice) or self.penalty._contains_choice_parameters(): # type: ignore[union-attr] + raise FittingWithChoiceError + + def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None: + if not isinstance(self._penalty, Choice) and not self.penalty._contains_choice_parameters(): # type: ignore[union-attr] + raise FittingWithoutChoiceError + + def _get_models_for_all_choices(self) -> list[LinearRegressor]: + 
# ----------------------------------------------------------------------------------------------------------------------
# Penalties
# ----------------------------------------------------------------------------------------------------------------------


class _Linear(LinearRegressor.Penalty):
    """Penalty of a simple linear regression. It has no parameters, so all instances are interchangeable."""

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __eq__(self, other: object) -> bool:
        # There is no state to compare: any two `_Linear` penalties are equal.
        return True if isinstance(other, _Linear) else NotImplemented

    def __hash__(self) -> int:
        class_name = self.__class__.__qualname__
        return _structural_hash(class_name)

    def __str__(self) -> str:
        return "Linear"

    # ------------------------------------------------------------------------------------------------------------------
    # Template methods
    # ------------------------------------------------------------------------------------------------------------------

    def _contains_choice_parameters(self) -> bool:
        # No parameters, hence no choices.
        return False

    def _get_sklearn_model(self) -> SklearnLinear:
        return SklearnLinear(n_jobs=-1)

    def _get_models_for_all_choices(self) -> list[LinearRegressor]:
        # Never reached through `LinearRegressor`, which only expands penalties that contain choices.
        raise NotImplementedError  # pragma: no cover
class _Ridge(LinearRegressor.Penalty):
    """
    Ridge penalty.

    Parameters
    ----------
    alpha:
        The regularization. Must not be negative; if a `Choice` is passed, this holds for each of its values.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(self, alpha: float | Choice[float] = 1.0) -> None:
        # Validation (every candidate value of a choice is checked individually)
        values_to_check = alpha if isinstance(alpha, Choice) else [alpha]
        for value in values_to_check:
            _check_bounds("alpha", value, lower_bound=_ClosedBound(0))

        self._alpha = alpha

    def __eq__(self, other: object) -> bool:
        if isinstance(other, _Ridge):
            return self._alpha == other._alpha
        return NotImplemented

    def __hash__(self) -> int:
        return _structural_hash(self.__class__.__qualname__, self._alpha)

    def __sizeof__(self) -> int:
        return sys.getsizeof(self._alpha)

    def __str__(self) -> str:
        return f"Ridge(alpha={self._alpha})"

    # ------------------------------------------------------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def alpha(self) -> float | Choice[float]:
        """The regularization of the ridge penalty."""
        return self._alpha

    # ------------------------------------------------------------------------------------------------------------------
    # Template methods
    # ------------------------------------------------------------------------------------------------------------------

    def _contains_choice_parameters(self) -> bool:
        return isinstance(self._alpha, Choice)

    def _get_sklearn_model(self) -> SklearnRidge:
        return SklearnRidge(alpha=self._alpha)

    def _get_models_for_all_choices(self) -> list[LinearRegressor]:
        # Only called when `alpha` is a choice; the assert also narrows the type.
        assert isinstance(self._alpha, Choice)
        return [
            LinearRegressor(penalty=LinearRegressor.Penalty.ridge(alpha=candidate)) for candidate in self._alpha
        ]
class _Lasso(LinearRegressor.Penalty):
    """
    Lasso penalty.

    Parameters
    ----------
    alpha:
        The regularization. Must not be negative; if a `Choice` is passed, this holds for each of its values.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(self, alpha: float | Choice[float] = 1.0) -> None:
        # Validation (every candidate value of a choice is checked individually)
        if isinstance(alpha, Choice):
            for a in alpha:
                _check_bounds("alpha", a, lower_bound=_ClosedBound(0))
        else:
            _check_bounds("alpha", alpha, lower_bound=_ClosedBound(0))

        self._alpha = alpha

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, _Lasso):
            return NotImplemented
        # Compare the same state that `__hash__` uses. Returning True unconditionally made penalties with different
        # `alpha` equal although their hashes differ, which breaks the contract between `__eq__` and `__hash__`.
        return self._alpha == other._alpha

    def __hash__(self) -> int:
        return _structural_hash(
            self.__class__.__qualname__,
            self._alpha,
        )

    def __str__(self) -> str:
        return f"Lasso(alpha={self._alpha})"

    # ------------------------------------------------------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def alpha(self) -> float | Choice[float]:
        """The regularization of the lasso penalty."""
        return self._alpha

    # ------------------------------------------------------------------------------------------------------------------
    # Template methods
    # ------------------------------------------------------------------------------------------------------------------

    def _contains_choice_parameters(self) -> bool:
        return isinstance(self._alpha, Choice)

    def _get_sklearn_model(self) -> SklearnLasso:
        return SklearnLasso(alpha=self._alpha)

    def _get_models_for_all_choices(self) -> list[LinearRegressor]:
        # Only called when `alpha` is a choice; the assert also narrows the type.
        assert isinstance(self._alpha, Choice)
        return [LinearRegressor(penalty=LinearRegressor.Penalty.lasso(alpha=alpha)) for alpha in self._alpha]
class _ElasticNet(LinearRegressor.Penalty):
    """
    Elastic net penalty.

    Parameters
    ----------
    alpha:
        The regularization. Must not be negative; if a `Choice` is passed, this holds for each of its values.
    lasso_ratio:
        The share of the lasso penalty, passed to scikit-learn as `l1_ratio`. Must be between 0 and 1 (inclusive);
        if a `Choice` is passed, this holds for each of its values.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(self, alpha: float | Choice[float] = 1.0, lasso_ratio: float | Choice[float] = 0.5) -> None:
        # Validation (every candidate value of a choice is checked individually)
        if isinstance(alpha, Choice):
            for a in alpha:
                _check_bounds("alpha", a, lower_bound=_ClosedBound(0))
        else:
            _check_bounds("alpha", alpha, lower_bound=_ClosedBound(0))

        if isinstance(lasso_ratio, Choice):
            for lr in lasso_ratio:
                _check_bounds("lasso_ratio", lr, lower_bound=_ClosedBound(0), upper_bound=_ClosedBound(1))
        else:
            _check_bounds("lasso_ratio", lasso_ratio, lower_bound=_ClosedBound(0), upper_bound=_ClosedBound(1))

        self._alpha = alpha
        self._lasso_ratio = lasso_ratio

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, _ElasticNet):
            return NotImplemented
        # Compare the same state that `__hash__` uses. Returning True unconditionally made penalties with different
        # parameters equal although their hashes differ, which breaks the contract between `__eq__` and `__hash__`.
        return self._alpha == other._alpha and self._lasso_ratio == other._lasso_ratio

    def __hash__(self) -> int:
        return _structural_hash(
            self.__class__.__qualname__,
            self._alpha,
            self._lasso_ratio,
        )

    def __str__(self) -> str:
        return f"ElasticNet(alpha={self._alpha}, lasso_ratio={self._lasso_ratio})"

    # ------------------------------------------------------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def alpha(self) -> float | Choice[float]:
        """The regularization of the elastic net penalty."""
        return self._alpha

    @property
    def lasso_ratio(self) -> float | Choice[float]:
        """The share of the lasso penalty in the elastic net penalty."""
        return self._lasso_ratio

    # ------------------------------------------------------------------------------------------------------------------
    # Template methods
    # ------------------------------------------------------------------------------------------------------------------

    def _contains_choice_parameters(self) -> bool:
        return isinstance(self._alpha, Choice) or isinstance(self._lasso_ratio, Choice)

    def _get_sklearn_model(self) -> SklearnElasticNet:
        return SklearnElasticNet(alpha=self._alpha, l1_ratio=self._lasso_ratio)

    def _get_models_for_all_choices(self) -> list[LinearRegressor]:
        # A parameter that is not a choice contributes exactly one option.
        alpha_choices = self._alpha if isinstance(self._alpha, Choice) else [self._alpha]
        lasso_choices = self._lasso_ratio if isinstance(self._lasso_ratio, Choice) else [self._lasso_ratio]

        return [
            LinearRegressor(penalty=LinearRegressor.Penalty.elastic_net(alpha=alpha, lasso_ratio=lasso))
            for alpha in alpha_choices
            for lasso in lasso_choices
        ]


# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type.
# This is needed for the DSL, where LinearRegressor penalties are variants of an enum.
def _check_additional_fit_preconditions(self) -> None:
    """Raise `FittingWithChoiceError` if at least one hyperparameter is a `Choice`."""
    hyperparameters = (self._tree_count, self._max_depth, self._min_sample_count_in_leaves)
    if any(isinstance(value, Choice) for value in hyperparameters):
        raise FittingWithChoiceError

def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None:
    """Raise `FittingWithoutChoiceError` if none of the hyperparameters is a `Choice`."""
    hyperparameters = (self._tree_count, self._max_depth, self._min_sample_count_in_leaves)
    if not any(isinstance(value, Choice) for value in hyperparameters):
        raise FittingWithoutChoiceError
def _get_models_for_all_choices(self) -> list[RandomForestRegressor]:
    """Create one regressor for every combination of the hyperparameter choices."""
    # A hyperparameter that is not a choice contributes exactly one option.
    if isinstance(self._tree_count, Choice):
        tree_counts = self._tree_count
    else:
        tree_counts = [self._tree_count]

    if isinstance(self._max_depth, Choice):
        max_depths = self._max_depth
    else:
        max_depths = [self._max_depth]

    if isinstance(self._min_sample_count_in_leaves, Choice):
        min_sample_counts = self._min_sample_count_in_leaves
    else:
        min_sample_counts = [self._min_sample_count_in_leaves]

    # Same nesting order as three nested loops: tree count outermost, leaf sample count innermost.
    return [
        RandomForestRegressor(
            tree_count=tree_count,
            max_depth=max_depth,
            min_sample_count_in_leaves=min_sample_count,
        )
        for tree_count in tree_counts
        for max_depth in max_depths
        for min_sample_count in min_sample_counts
    ]
choices to create multiple models and fit them. + + **Note:** This model is not modified. + + Parameters + ---------- + training_set: + The training data containing the features and target. + optimization_metric: + The metric that should be used for determining the performance of a model. + + Returns + ------- + best_model: + The model that performed the best out of all possible models given the Choices of hyperparameters. + + Raises + ------ + PlainTableError + If a table is passed instead of a TabularDataset. + DatasetMissesDataError + If the given training set contains no data. + FittingWithoutChoiceError + When trying to call this method on a model without hyperparameter choices. + LearningError + If the training data contains invalid values or if the training failed. + """ + if training_set.to_table().row_count == 0: + raise DatasetMissesDataError + + self._check_additional_fit_by_exhaustive_search_preconditions() + + [train_split, test_split] = training_set.to_table().split_rows(0.75) + train_data = train_split.to_tabular_dataset( + target_name=training_set.target.name, + extra_names=training_set.extras.column_names, + ) + test_data = test_split.to_tabular_dataset( + target_name=training_set.target.name, + extra_names=training_set.extras.column_names, + ) + + list_of_models = self._get_models_for_all_choices() + list_of_fitted_models = [] + + with ProcessPoolExecutor(max_workers=len(list_of_models), mp_context=mp.get_context("spawn")) as executor: + futures = [] + for model in list_of_models: + futures.append(executor.submit(model.fit, train_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + list_of_fitted_models.append(future.result()) + executor.shutdown() + + best_model = None + best_metric_value = None + for fitted_model in list_of_fitted_models: + if best_model is None: + best_model = fitted_model + match optimization_metric.value: + case "mean_squared_error": + best_metric_value = 
fitted_model.mean_squared_error(test_data) + case "mean_absolute_error": + best_metric_value = fitted_model.mean_absolute_error(test_data) + case "median_absolute_deviation": + best_metric_value = fitted_model.median_absolute_deviation(test_data) + case "coefficient_of_determination": + best_metric_value = fitted_model.coefficient_of_determination(test_data) + else: + match optimization_metric.value: + case "mean_squared_error": + error_of_fitted_model = fitted_model.mean_squared_error(test_data) + if error_of_fitted_model < best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = error_of_fitted_model # pragma: no cover + case "mean_absolute_error": + error_of_fitted_model = fitted_model.mean_absolute_error(test_data) + if error_of_fitted_model < best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = error_of_fitted_model # pragma: no cover + case "median_absolute_deviation": + error_of_fitted_model = fitted_model.median_absolute_deviation(test_data) + if error_of_fitted_model < best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = error_of_fitted_model # pragma: no cover + case "coefficient_of_determination": + error_of_fitted_model = fitted_model.coefficient_of_determination(test_data) + if error_of_fitted_model > best_metric_value: + best_model = fitted_model # pragma: no cover + best_metric_value = error_of_fitted_model # pragma: no cover + assert best_model is not None + return best_model + def _check_metrics_preconditions(actual: Column, expected: Column) -> None: # pragma: no cover if not actual.type.is_numeric: diff --git a/src/safeds/ml/classical/regression/_ridge_regressor.py b/src/safeds/ml/classical/regression/_ridge_regressor.py deleted file mode 100644 index d6226793d..000000000 --- a/src/safeds/ml/classical/regression/_ridge_regressor.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import 
TYPE_CHECKING - -from safeds._utils import _structural_hash -from safeds._validation import _check_bounds, _ClosedBound - -from ._regressor import Regressor - -if TYPE_CHECKING: - from sklearn.base import RegressorMixin - - -class RidgeRegressor(Regressor): - """ - Ridge regression. - - Parameters - ---------- - alpha: - Controls the regularization of the model. The higher the value, the more regularized it becomes. - - Raises - ------ - OutOfBoundsError - If `alpha` is negative. - """ - - # ------------------------------------------------------------------------------------------------------------------ - # Dunder methods - # ------------------------------------------------------------------------------------------------------------------ - - def __init__(self, *, alpha: float = 1.0) -> None: - super().__init__() - - # Validation - _check_bounds("alpha", alpha, lower_bound=_ClosedBound(0)) - if alpha == 0.0: - warnings.warn( - ( - "Setting alpha to zero makes this model equivalent to LinearRegression. You should use " - "LinearRegression instead for better numerical stability." - ), - UserWarning, - stacklevel=2, - ) - - # Hyperparameters - self._alpha = alpha - - def __hash__(self) -> int: - return _structural_hash( - super().__hash__(), - self._alpha, - ) - - # ------------------------------------------------------------------------------------------------------------------ - # Properties - # ------------------------------------------------------------------------------------------------------------------ - - @property - def alpha(self) -> float: - """ - Get the regularization of the model. - - Returns - ------- - result: - The regularization of the model. 
- """ - return self._alpha - - # ------------------------------------------------------------------------------------------------------------------ - # Template methods - # ------------------------------------------------------------------------------------------------------------------ - - def _clone(self) -> RidgeRegressor: - return RidgeRegressor(alpha=self._alpha) - - def _get_sklearn_model(self) -> RegressorMixin: - from sklearn.linear_model import Ridge as SklearnRidge - - return SklearnRidge(alpha=self._alpha) diff --git a/src/safeds/ml/classical/regression/_support_vector_regressor.py b/src/safeds/ml/classical/regression/_support_vector_regressor.py index 03c08d664..24ab4196b 100644 --- a/src/safeds/ml/classical/regression/_support_vector_regressor.py +++ b/src/safeds/ml/classical/regression/_support_vector_regressor.py @@ -3,8 +3,10 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash +from safeds.exceptions import FittingWithChoiceError, FittingWithoutChoiceError from safeds.ml.classical._bases import _SupportVectorMachineBase from safeds.ml.classical.regression import Regressor +from safeds.ml.hyperparameters import Choice if TYPE_CHECKING: from sklearn.base import RegressorMixin @@ -34,8 +36,8 @@ class SupportVectorRegressor(Regressor, _SupportVectorMachineBase): def __init__( self, *, - c: float = 1.0, - kernel: SupportVectorRegressor.Kernel | None = None, + c: float | Choice[float] = 1.0, + kernel: SupportVectorRegressor.Kernel | None | Choice[SupportVectorRegressor.Kernel | None] = None, ) -> None: # Initialize superclasses Regressor.__init__(self) @@ -56,7 +58,7 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property - def kernel(self) -> SupportVectorRegressor.Kernel: + def kernel(self) -> SupportVectorRegressor.Kernel | Choice[SupportVectorRegressor.Kernel | None]: """The type of kernel used.""" return self._kernel @@ 
def _check_additional_fit_preconditions(self) -> None:
    """Raise `FittingWithChoiceError` if `c` or the kernel is a `Choice`."""
    if any(isinstance(value, Choice) for value in (self._c, self.kernel)):
        raise FittingWithChoiceError

def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None:
    """Raise `FittingWithoutChoiceError` if neither `c` nor the kernel is a `Choice`."""
    if not any(isinstance(value, Choice) for value in (self._c, self.kernel)):
        raise FittingWithoutChoiceError

def _get_models_for_all_choices(self) -> list[SupportVectorRegressor]:
    """Create one regressor for every combination of the choices for `c` and the kernel."""
    # A hyperparameter that is not a choice contributes exactly one option.
    c_options = self._c if isinstance(self._c, Choice) else [self._c]
    kernel_options = self.kernel if isinstance(self.kernel, Choice) else [self.kernel]

    return [
        SupportVectorRegressor(c=c_option, kernel=kernel_option)
        for c_option in c_options
        for kernel_option in kernel_options
    ]
class ClassifierMetric(Enum):
    """Metrics that can be selected to compare classifiers."""

    # Each value is the lower-case name of the metric.
    ACCURACY = "accuracy"
    PRECISION = "precision"
    RECALL = "recall"
    F1_SCORE = "f1_score"


class RegressorMetric(Enum):
    """Metrics that can be selected to compare regressors."""

    # Each value is the lower-case name of the metric.
    MEAN_SQUARED_ERROR = "mean_squared_error"
    MEAN_ABSOLUTE_ERROR = "mean_absolute_error"
    MEDIAN_ABSOLUTE_DEVIATION = "median_absolute_deviation"
    COEFFICIENT_OF_DETERMINATION = "coefficient_of_determination"
a/tests/safeds/ml/classical/classification/test_ada_boost.py b/tests/safeds/ml/classical/classification/test_ada_boost.py index ecfe6f3f4..28e44851b 100644 --- a/tests/safeds/ml/classical/classification/test_ada_boost.py +++ b/tests/safeds/ml/classical/classification/test_ada_boost.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.classification import AdaBoostClassifier +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -34,8 +35,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("max_learner_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_learner_count: int) -> None: + @pytest.mark.parametrize("max_learner_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_learner_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): AdaBoostClassifier(max_learner_count=max_learner_count) @@ -50,7 +51,7 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.learning_rate == 2 - @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: + @pytest.mark.parametrize("learning_rate", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): AdaBoostClassifier(learning_rate=learning_rate) diff --git a/tests/safeds/ml/classical/classification/test_classifier.py 
def classifiers_with_choices() -> list[Classifier]:
    """
    Return classifiers whose hyperparameters are `Choice`s, to test the choice functionality.

    After you implemented a new classifier, add it to this list to ensure its `fit_by_exhaustive_search` method works
    as expected. Place tests of methods that are specific to your classifier in a separate test file.

    Returns
    -------
    classifiers : list[Classifier]
        The list of classifiers to test.
    """
    ada_boost = AdaBoostClassifier(
        learner=Choice(AdaBoostClassifier(), None),
        max_learner_count=Choice(1, 2),
        learning_rate=Choice(0.1, 0.2),
    )
    decision_tree = DecisionTreeClassifier(max_depth=Choice(1, 2), min_sample_count_in_leaves=Choice(1, 2))
    gradient_boosting = GradientBoostingClassifier(tree_count=Choice(1, 2), learning_rate=Choice(0.1, 0.2))
    k_nearest_neighbors = KNearestNeighborsClassifier(neighbor_count=Choice(1, 2))
    random_forest = RandomForestClassifier(
        tree_count=Choice(1, 2),
        max_depth=Choice(1, 2),
        min_sample_count_in_leaves=Choice(1, 2),
    )
    support_vector = SupportVectorClassifier(
        kernel=Choice(None, SupportVectorClassifier.Kernel.linear()),
        c=Choice(0.5, 1.0),
    )

    return [
        ada_boost,
        decision_tree,
        gradient_boosting,
        k_nearest_neighbors,
        random_forest,
        support_vector,
    ]


@pytest.fixture()
def valid_data() -> TabularDataset:
    # Four rows, so that a split into training and validation data leaves rows on both sides.
    table = Table(
        {
            "id": [1, 4, 7, 10],
            "feat1": [2, 5, 8, 11],
            "feat2": [3, 6, 9, 12],
            "target": [0, 1, 0, 1],
        },
    )
    return table.to_tabular_dataset(target_name="target", extra_names=["id"])


@pytest.mark.parametrize("classifier_with_choice", classifiers_with_choices(), ids=lambda x: x.__class__.__name__)
class TestChoiceClassifiers:
    def test_should_raise_if_model_is_fitted_with_choice(
        self,
        classifier_with_choice: Classifier,
        valid_data: TabularDataset,
    ) -> None:
        # The plain `fit` must reject models that still contain choices.
        with pytest.raises(FittingWithChoiceError):
            classifier_with_choice.fit(valid_data)

    def test_should_raise_if_no_positive_class_is_provided(
        self,
        classifier_with_choice: Classifier,
        valid_data: TabularDataset,
    ) -> None:
        # Precision is requested without passing a positive class.
        with pytest.raises(LearningError):
            classifier_with_choice.fit_by_exhaustive_search(valid_data, optimization_metric=ClassifierMetric.PRECISION)

    def test_workflow_with_choice_parameter(
        self,
        classifier_with_choice: Classifier,
        valid_data: TabularDataset,
    ) -> None:
        best_model = classifier_with_choice.fit_by_exhaustive_search(valid_data, ClassifierMetric.ACCURACY)
        assert isinstance(best_model, type(classifier_with_choice))

        prediction = best_model.predict(valid_data)
        assert isinstance(prediction, TabularDataset)
class TestFitByExhaustiveSearch:
    @pytest.mark.parametrize("classifier", classifiers(), ids=lambda x: x.__class__.__name__)
    def test_should_raise_if_model_is_fitted_by_exhaustive_search_without_choice(
        self,
        classifier: Classifier,
        valid_data: TabularDataset,
    ) -> None:
        # Classifiers without any choice have nothing to search over.
        with pytest.raises(FittingWithoutChoiceError):
            classifier.fit_by_exhaustive_search(valid_data, optimization_metric=ClassifierMetric.ACCURACY)

    @pytest.mark.parametrize(
        ("metric", "positive_class"),
        [
            (ClassifierMetric.ACCURACY, None),
            (ClassifierMetric.PRECISION, 0),
            (ClassifierMetric.RECALL, 0),
            (ClassifierMetric.F1_SCORE, 0),
        ],
        ids=["accuracy", "precision", "recall", "f1_score"],
    )
    def test_should_check_return_type_with_metric(
        self,
        valid_data: TabularDataset,
        metric: ClassifierMetric,
        positive_class: Any,
    ) -> None:
        classifier = AdaBoostClassifier(max_learner_count=Choice(2, 3))
        best_model = classifier.fit_by_exhaustive_search(
            valid_data,
            optimization_metric=metric,
            positive_class=positive_class,
        )
        assert isinstance(best_model, AdaBoostClassifier)

    def test_should_raise_when_dataset_misses_data(self) -> None:
        empty_dataset = Table.from_dict({"a": [], "b": []}).to_tabular_dataset("a")
        classifier = AdaBoostClassifier(max_learner_count=Choice(2, 3))

        with pytest.raises(DatasetMissesDataError):
            classifier.fit_by_exhaustive_search(empty_dataset, ClassifierMetric.ACCURACY)
safeds.exceptions import ModelNotFittedError, OutOfBoundsError from safeds.ml.classical.classification import DecisionTreeClassifier +from safeds.ml.hyperparameters import Choice from syrupy import SnapshotAssertion from tests.helpers import os_mac, skip_if_os @@ -24,8 +25,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.max_depth == 2 - @pytest.mark.parametrize("max_depth", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int) -> None: + @pytest.mark.parametrize("max_depth", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int | None | Choice[int | None]) -> None: with pytest.raises(OutOfBoundsError): DecisionTreeClassifier(max_depth=max_depth) @@ -40,8 +41,12 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.min_samples_leaf == 2 - @pytest.mark.parametrize("min_sample_count_in_leaves", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int) -> None: + @pytest.mark.parametrize( + "min_sample_count_in_leaves", + [-1, 0, Choice(-1)], + ids=["minus_one", "zero", "invalid_choice"], + ) + def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): DecisionTreeClassifier(min_sample_count_in_leaves=min_sample_count_in_leaves) diff --git a/tests/safeds/ml/classical/classification/test_gradient_boosting.py b/tests/safeds/ml/classical/classification/test_gradient_boosting.py index 31f62d822..14aae9aa2 100644 --- a/tests/safeds/ml/classical/classification/test_gradient_boosting.py +++ 
b/tests/safeds/ml/classical/classification/test_gradient_boosting.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.classification import GradientBoostingClassifier +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("tree_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_1(self, tree_count: int) -> None: + @pytest.mark.parametrize("tree_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_1(self, tree_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): GradientBoostingClassifier(tree_count=tree_count) @@ -37,7 +38,7 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.learning_rate == 2 - @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: + @pytest.mark.parametrize("learning_rate", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): GradientBoostingClassifier(learning_rate=learning_rate) diff --git a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py index 1cd420ea8..66437d782 100644 --- a/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/classification/test_k_nearest_neighbors.py @@ -3,6 +3,7 @@ from 
safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.classification import KNearestNeighborsClassifier +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_neighbors == 2 - @pytest.mark.parametrize("neighbor_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, neighbor_count: int) -> None: + @pytest.mark.parametrize("neighbor_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, neighbor_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): KNearestNeighborsClassifier(neighbor_count=neighbor_count) diff --git a/tests/safeds/ml/classical/classification/test_random_forest.py b/tests/safeds/ml/classical/classification/test_random_forest.py index 2fe1950a2..8f9efabad 100644 --- a/tests/safeds/ml/classical/classification/test_random_forest.py +++ b/tests/safeds/ml/classical/classification/test_random_forest.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.classification import RandomForestClassifier +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("tree_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, tree_count: int) -> None: + @pytest.mark.parametrize("tree_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, tree_count: 
int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): RandomForestClassifier(tree_count=tree_count) @@ -37,8 +38,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.max_depth == 2 - @pytest.mark.parametrize("max_depth", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int) -> None: + @pytest.mark.parametrize("max_depth", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int | None | Choice[int | None]) -> None: with pytest.raises(OutOfBoundsError): RandomForestClassifier(max_depth=max_depth) @@ -53,7 +54,11 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.min_samples_leaf == 2 - @pytest.mark.parametrize("min_sample_count_in_leaves", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int) -> None: + @pytest.mark.parametrize( + "min_sample_count_in_leaves", + [-1, 0, Choice(-1)], + ids=["minus_one", "zero", "invalid_choice"], + ) + def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): RandomForestClassifier(min_sample_count_in_leaves=min_sample_count_in_leaves) diff --git a/tests/safeds/ml/classical/classification/test_support_vector_machine.py b/tests/safeds/ml/classical/classification/test_support_vector_machine.py index a601d5cf8..6fae902ed 100644 --- a/tests/safeds/ml/classical/classification/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/classification/test_support_vector_machine.py @@ -6,6 +6,7 @@ from safeds.exceptions import OutOfBoundsError from safeds.ml.classical._bases._support_vector_machine_base 
import _Linear, _Polynomial from safeds.ml.classical.classification import SupportVectorClassifier +from safeds.ml.hyperparameters import Choice def kernels() -> list[SupportVectorClassifier.Kernel]: @@ -44,8 +45,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.C == 2 - @pytest.mark.parametrize("c", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, c: float) -> None: + @pytest.mark.parametrize("c", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, c: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): SupportVectorClassifier(c=c) diff --git a/tests/safeds/ml/classical/regression/test_ada_boost.py b/tests/safeds/ml/classical/regression/test_ada_boost.py index 36cd9cb64..58eebdd97 100644 --- a/tests/safeds/ml/classical/regression/test_ada_boost.py +++ b/tests/safeds/ml/classical/regression/test_ada_boost.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.regression import AdaBoostRegressor +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -34,8 +35,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("max_learner_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_learner_count: int) -> None: + @pytest.mark.parametrize("max_learner_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_learner_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): 
AdaBoostRegressor(max_learner_count=max_learner_count) @@ -50,7 +51,7 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.learning_rate == 2 - @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: + @pytest.mark.parametrize("learning_rate", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): AdaBoostRegressor(learning_rate=learning_rate) diff --git a/tests/safeds/ml/classical/regression/test_arima_model.py b/tests/safeds/ml/classical/regression/test_arima_model.py index 0e68b7160..06a114bc7 100644 --- a/tests/safeds/ml/classical/regression/test_arima_model.py +++ b/tests/safeds/ml/classical/regression/test_arima_model.py @@ -9,7 +9,7 @@ ModelNotFittedError, NonNumericColumnError, ) -from safeds.ml.classical.regression import ArimaModelRegressor, LassoRegressor +from safeds.ml.classical.regression import AdaBoostRegressor, ArimaModelRegressor from tests.helpers import resolve_resource_path @@ -177,7 +177,7 @@ def test_should_return_same_hash_for_equal_regressor() -> None: def test_should_return_different_hash_for_unequal_regressor() -> None: regressor1 = ArimaModelRegressor() - regressor2 = LassoRegressor() + regressor2 = AdaBoostRegressor() assert hash(regressor1) != hash(regressor2) diff --git a/tests/safeds/ml/classical/regression/test_decision_tree.py b/tests/safeds/ml/classical/regression/test_decision_tree.py index 6a39e1968..4999a85cb 100644 --- a/tests/safeds/ml/classical/regression/test_decision_tree.py +++ b/tests/safeds/ml/classical/regression/test_decision_tree.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import 
ModelNotFittedError, OutOfBoundsError from safeds.ml.classical.regression import DecisionTreeRegressor +from safeds.ml.hyperparameters import Choice from syrupy import SnapshotAssertion @@ -22,8 +23,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.max_depth == 2 - @pytest.mark.parametrize("max_depth", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int) -> None: + @pytest.mark.parametrize("max_depth", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int | None | Choice[int | None]) -> None: with pytest.raises(OutOfBoundsError): DecisionTreeRegressor(max_depth=max_depth) @@ -38,8 +39,12 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.min_samples_leaf == 2 - @pytest.mark.parametrize("min_sample_count_in_leaves", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int) -> None: + @pytest.mark.parametrize( + "min_sample_count_in_leaves", + [-1, 0, Choice(-1)], + ids=["minus_one", "zero", "invalid_choice"], + ) + def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): DecisionTreeRegressor(min_sample_count_in_leaves=min_sample_count_in_leaves) diff --git a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py b/tests/safeds/ml/classical/regression/test_elastic_net_regression.py deleted file mode 100644 index 1cb19d1cf..000000000 --- a/tests/safeds/ml/classical/regression/test_elastic_net_regression.py +++ /dev/null @@ -1,73 +0,0 @@ -import pytest -from safeds.data.labeled.containers import TabularDataset -from 
safeds.data.tabular.containers import Table -from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.regression import ElasticNetRegressor - - -@pytest.fixture() -def training_set() -> TabularDataset: - table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1") - - -class TestAlpha: - def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = ElasticNetRegressor(alpha=1).fit(training_set) - assert fitted_model.alpha == 1 - - def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = ElasticNetRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_model is not None - assert fitted_model._wrapped_model.alpha == 1 - - @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_0_point_5"]) - def test_should_raise_if_less_than_0(self, alpha: float) -> None: - with pytest.raises(OutOfBoundsError): - ElasticNetRegressor(alpha=alpha) - - def test_should_warn_if_equal_to_0(self) -> None: - with pytest.warns( - UserWarning, - match=( - "Setting alpha to zero makes this model equivalent to LinearRegression. You " - "should use LinearRegression instead for better numerical stability." 
- ), - ): - ElasticNetRegressor(alpha=0) - - -class TestLassoRatio: - def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = ElasticNetRegressor(lasso_ratio=0.3).fit(training_set) - assert fitted_model.lasso_ratio == 0.3 - - def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = ElasticNetRegressor(lasso_ratio=0.3).fit(training_set) - assert fitted_model._wrapped_model is not None - assert fitted_model._wrapped_model.l1_ratio == 0.3 - - @pytest.mark.parametrize("lasso_ratio", [-0.5, 1.5], ids=["minus_zero_point_5", "one_point_5"]) - def test_should_raise_if_not_between_0_and_1(self, lasso_ratio: float) -> None: - with pytest.raises(OutOfBoundsError): - ElasticNetRegressor(lasso_ratio=lasso_ratio) - - def test_should_warn_if_0(self) -> None: - with pytest.warns( - UserWarning, - match=( - "ElasticNetRegression with lasso_ratio = 0 is essentially RidgeRegression." - " Use RidgeRegression instead for better numerical stability." - ), - ): - ElasticNetRegressor(lasso_ratio=0) - - def test_should_warn_if_1(self) -> None: - with pytest.warns( - UserWarning, - match=( - "ElasticNetRegression with lasso_ratio = 0 is essentially LassoRegression." - " Use LassoRegression instead for better numerical stability." 
- ), - ): - ElasticNetRegressor(lasso_ratio=1) diff --git a/tests/safeds/ml/classical/regression/test_gradient_boosting.py b/tests/safeds/ml/classical/regression/test_gradient_boosting.py index f72a5a9fd..99c37277a 100644 --- a/tests/safeds/ml/classical/regression/test_gradient_boosting.py +++ b/tests/safeds/ml/classical/regression/test_gradient_boosting.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.regression import GradientBoostingRegressor +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("tree_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_1(self, tree_count: int) -> None: + @pytest.mark.parametrize("tree_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_1(self, tree_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): GradientBoostingRegressor(tree_count=tree_count) @@ -37,7 +38,7 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.learning_rate == 2 - @pytest.mark.parametrize("learning_rate", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float) -> None: + @pytest.mark.parametrize("learning_rate", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, learning_rate: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): GradientBoostingRegressor(learning_rate=learning_rate) diff --git a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py 
b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py index 1bd09af73..1e1342d56 100644 --- a/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py +++ b/tests/safeds/ml/classical/regression/test_k_nearest_neighbors.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.regression import KNearestNeighborsRegressor +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_neighbors == 2 - @pytest.mark.parametrize("neighbor_count", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, neighbor_count: int) -> None: + @pytest.mark.parametrize("neighbor_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, neighbor_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): KNearestNeighborsRegressor(neighbor_count=neighbor_count) diff --git a/tests/safeds/ml/classical/regression/test_lasso_regression.py b/tests/safeds/ml/classical/regression/test_lasso_regression.py deleted file mode 100644 index 294b8b421..000000000 --- a/tests/safeds/ml/classical/regression/test_lasso_regression.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table -from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.regression import LassoRegressor - - -@pytest.fixture() -def training_set() -> TabularDataset: - table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1") - - -class TestAlpha: - def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = 
LassoRegressor(alpha=1).fit(training_set) - assert fitted_model.alpha == 1 - - def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = LassoRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_model is not None - assert fitted_model._wrapped_model.alpha == 1 - - @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_zero_point_5"]) - def test_should_raise_if_less_than_0(self, alpha: float) -> None: - with pytest.raises(OutOfBoundsError): - LassoRegressor(alpha=alpha) - - def test_should_warn_if_equal_to_0(self) -> None: - with pytest.warns( - UserWarning, - match=( - "Setting alpha to zero makes this model equivalent to LinearRegression. You " - "should use LinearRegression instead for better numerical stability." - ), - ): - LassoRegressor(alpha=0) diff --git a/tests/safeds/ml/classical/regression/test_linear_regressor.py b/tests/safeds/ml/classical/regression/test_linear_regressor.py new file mode 100644 index 000000000..0334e58ea --- /dev/null +++ b/tests/safeds/ml/classical/regression/test_linear_regressor.py @@ -0,0 +1,160 @@ +import sys + +import pytest +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Table +from safeds.exceptions import OutOfBoundsError +from safeds.ml.classical.regression._linear_regressor import LinearRegressor, _Linear +from safeds.ml.hyperparameters import Choice + + +def penalties() -> list[LinearRegressor.Penalty]: + """ + Return the list of penalties to test. + + After you implemented a new penalty, add it to this list to ensure its `__hash__` and `__eq__` method work as + expected. + + Returns + ------- + penalties: + The list of penalties to test. 
+ """ + return [ + LinearRegressor.Penalty.linear(), + LinearRegressor.Penalty.ridge(), + LinearRegressor.Penalty.lasso(), + LinearRegressor.Penalty.elastic_net(), + ] + + +@pytest.fixture() +def training_set() -> TabularDataset: + table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) + return table.to_tabular_dataset(target_name="col1") + + +class TestPenalty: + def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: + penalty = LinearRegressor.Penalty.linear() + fitted_model = LinearRegressor(penalty=penalty).fit(training_set=training_set) + assert isinstance(fitted_model.penalty, _Linear) + assert fitted_model._wrapped_model is not None + + @pytest.mark.parametrize( + ("penalty1", "penalty2"), + ([(x, y) for x in penalties() for y in penalties() if x.__class__ == y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_equal_penalties( + self, + penalty1: LinearRegressor.Penalty, + penalty2: LinearRegressor.Penalty, + ) -> None: + assert penalty1 == penalty2 + + @pytest.mark.parametrize( + ("penalty1", "penalty2"), + ([(x, y) for x in penalties() for y in penalties() if x.__class__ != y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_unequal_penalties( + self, + penalty1: LinearRegressor.Penalty, + penalty2: LinearRegressor.Penalty, + ) -> None: + assert penalty1 != penalty2 + + @pytest.mark.parametrize( + ("penalty1", "penalty2"), + ([(x, y) for x in penalties() for y in penalties() if x.__class__ == y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_should_return_same_hash_for_equal_penalties( + self, + penalty1: LinearRegressor.Penalty, + penalty2: LinearRegressor.Penalty, + ) -> None: + assert hash(penalty1) == hash(penalty2) + + @pytest.mark.parametrize( + ("penalty1", "penalty2"), + ([(x, y) for x in penalties() for y in penalties() if x.__class__ != y.__class__]), + ids=lambda x: x.__class__.__name__, + ) + def test_should_return_different_hash_for_unequal_penalties( + 
self, + penalty1: LinearRegressor.Penalty, + penalty2: LinearRegressor.Penalty, + ) -> None: + assert hash(penalty1) != hash(penalty2) + + @pytest.mark.parametrize( + "penalty", + ([LinearRegressor.Penalty.ridge(), LinearRegressor.Penalty.lasso(), LinearRegressor.Penalty.elastic_net()]), + ids=lambda x: x.__class__.__name__, + ) + def test_sizeof_kernel( + self, + penalty: LinearRegressor.Penalty, + ) -> None: + assert sys.getsizeof(penalty) > sys.getsizeof(object()) + + class TestLinear: + def test_str(self) -> None: + linear_penalty = LinearRegressor.Penalty.linear() + assert linear_penalty.__str__() == "Linear" + + class TestRidge: + def test_str(self) -> None: + ridge_penalty = LinearRegressor.Penalty.ridge(0.5) + assert ridge_penalty.__str__() == f"Ridge(alpha={0.5})" + + @pytest.mark.parametrize("alpha", [-0.5, Choice(-0.5)], ids=["minus_zero_point_five", "invalid_choice"]) + def test_should_raise_if_alpha_out_of_bounds_ridge(self, alpha: float | Choice[float]) -> None: + with pytest.raises(OutOfBoundsError): + LinearRegressor(penalty=LinearRegressor.Penalty.ridge(alpha=alpha)) + + def test_should_assert_alpha_is_set_correctly(self) -> None: + alpha = 0.69 + assert LinearRegressor.Penalty.ridge(alpha=alpha).alpha == alpha # type: ignore[attr-defined] + + class TestLasso: + def test_str(self) -> None: + lasso_penalty = LinearRegressor.Penalty.lasso(0.5) + assert lasso_penalty.__str__() == f"Lasso(alpha={0.5})" + + @pytest.mark.parametrize("alpha", [-0.5, Choice(-0.5)], ids=["minus_zero_point_five", "invalid_choice"]) + def test_should_raise_if_alpha_out_of_bounds_lasso(self, alpha: float | Choice[float]) -> None: + with pytest.raises(OutOfBoundsError): + LinearRegressor(penalty=LinearRegressor.Penalty.lasso(alpha=alpha)) + + def test_should_assert_alpha_is_set_correctly(self) -> None: + alpha = 0.69 + assert LinearRegressor.Penalty.lasso(alpha=alpha).alpha == alpha # type: ignore[attr-defined] + + class TestElasticNet: + def test_str(self) -> None: + 
elastic_net_penalty = LinearRegressor.Penalty.elastic_net(0.5, 0.75) + assert elastic_net_penalty.__str__() == f"ElasticNet(alpha={0.5}, lasso_ratio={0.75})" + + @pytest.mark.parametrize("alpha", [-0.5, Choice(-0.5)], ids=["minus_zero_point_five", "invalid_choice"]) + def test_should_raise_if_alpha_out_of_bounds(self, alpha: float | Choice[float]) -> None: + with pytest.raises(OutOfBoundsError): + LinearRegressor(penalty=LinearRegressor.Penalty.elastic_net(alpha=alpha)) + + @pytest.mark.parametrize( + "lasso_ratio", + [-0.5, 1.5, Choice(-0.5)], + ids=["minus_zero_point_five", "one_point_five", "invalid_choice"], + ) + def test_should_raise_if_lasso_ratio_out_of_bounds(self, lasso_ratio: float | Choice[float]) -> None: + with pytest.raises(OutOfBoundsError): + LinearRegressor(penalty=LinearRegressor.Penalty.elastic_net(lasso_ratio=lasso_ratio)) + + def test_should_assert_alpha_is_set_correctly(self) -> None: + alpha = 0.69 + lasso_ratio = 0.96 + elastic_pen = LinearRegressor.Penalty.elastic_net(alpha=alpha, lasso_ratio=lasso_ratio) + assert elastic_pen.alpha == alpha # type: ignore[attr-defined] + assert elastic_pen.lasso_ratio == lasso_ratio # type: ignore[attr-defined] diff --git a/tests/safeds/ml/classical/regression/test_random_forest.py b/tests/safeds/ml/classical/regression/test_random_forest.py index b1b12fb26..40c84108d 100644 --- a/tests/safeds/ml/classical/regression/test_random_forest.py +++ b/tests/safeds/ml/classical/regression/test_random_forest.py @@ -3,6 +3,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.regression import RandomForestRegressor +from safeds.ml.hyperparameters import Choice @pytest.fixture() @@ -21,8 +22,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.n_estimators == 2 - @pytest.mark.parametrize("tree_count", [-1, 0], ids=["minus_one", 
"zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, tree_count: int) -> None: + @pytest.mark.parametrize("tree_count", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, tree_count: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): RandomForestRegressor(tree_count=tree_count) @@ -37,8 +38,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.max_depth == 2 - @pytest.mark.parametrize("max_depth", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int) -> None: + @pytest.mark.parametrize("max_depth", [-1, 0, Choice(-1)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, max_depth: int | None | Choice[int | None]) -> None: with pytest.raises(OutOfBoundsError): RandomForestRegressor(max_depth=max_depth) @@ -53,7 +54,11 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.min_samples_leaf == 2 - @pytest.mark.parametrize("min_sample_count_in_leaves", [-1, 0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int) -> None: + @pytest.mark.parametrize( + "min_sample_count_in_leaves", + [-1, 0, Choice(-1)], + ids=["minus_one", "zero", "invalid_choice"], + ) + def test_should_raise_if_less_than_or_equal_to_0(self, min_sample_count_in_leaves: int | Choice[int]) -> None: with pytest.raises(OutOfBoundsError): RandomForestRegressor(min_sample_count_in_leaves=min_sample_count_in_leaves) diff --git a/tests/safeds/ml/classical/regression/test_regressor.py b/tests/safeds/ml/classical/regression/test_regressor.py index 07207c775..340fe8d7c 100644 --- 
a/tests/safeds/ml/classical/regression/test_regressor.py +++ b/tests/safeds/ml/classical/regression/test_regressor.py @@ -10,6 +10,8 @@ ColumnLengthMismatchError, DatasetMissesDataError, DatasetMissesFeaturesError, + FittingWithChoiceError, + FittingWithoutChoiceError, MissingValuesColumnError, ModelNotFittedError, NonNumericColumnError, @@ -18,17 +20,16 @@ from safeds.ml.classical.regression import ( AdaBoostRegressor, DecisionTreeRegressor, - ElasticNetRegressor, GradientBoostingRegressor, KNearestNeighborsRegressor, - LassoRegressor, LinearRegressor, RandomForestRegressor, Regressor, - RidgeRegressor, SupportVectorRegressor, ) from safeds.ml.classical.regression._regressor import _check_metrics_preconditions +from safeds.ml.hyperparameters import Choice +from safeds.ml.metrics import RegressorMetric if TYPE_CHECKING: from _pytest.fixtures import FixtureRequest @@ -50,29 +51,120 @@ def regressors() -> list[Regressor]: return [ AdaBoostRegressor(), DecisionTreeRegressor(), - ElasticNetRegressor(), GradientBoostingRegressor(), KNearestNeighborsRegressor(2), - LassoRegressor(), LinearRegressor(), RandomForestRegressor(), - RidgeRegressor(), SupportVectorRegressor(), ] +def regressors_with_choices() -> list[Regressor]: + """ + Return the list of regressors with Choices as Parameters to test choice functionality. + + After you implemented a new regressor, add it to this list to ensure its `fit_by_exhaustive_search` method works as + expected. Place tests of methods that are specific to your regressor in a separate test file. + + Returns + ------- + regressors : list[Regressor] + The list of regressors to test. 
+ """ + return [ + AdaBoostRegressor( + learner=Choice(AdaBoostRegressor(), None), + max_learner_count=Choice(1, 2), + learning_rate=Choice(0.1, 0.2), + ), + DecisionTreeRegressor(max_depth=Choice(1, 2), min_sample_count_in_leaves=Choice(1, 2)), + GradientBoostingRegressor(tree_count=Choice(1, 2), learning_rate=Choice(0.1, 0.2)), + KNearestNeighborsRegressor(neighbor_count=Choice(1, 2)), + LinearRegressor( + penalty=Choice( + None, + LinearRegressor.Penalty.linear(), + ), + ), + LinearRegressor(penalty=LinearRegressor.Penalty.lasso(alpha=Choice(0.25, 0.75))), + LinearRegressor(penalty=LinearRegressor.Penalty.ridge(alpha=Choice(0.25, 0.75))), + LinearRegressor( + penalty=LinearRegressor.Penalty.elastic_net(alpha=Choice(1.0, 2.0), lasso_ratio=Choice(0.1, 0.9)), + ), + RandomForestRegressor(tree_count=Choice(1, 2), max_depth=Choice(1, 2), min_sample_count_in_leaves=Choice(1, 2)), + SupportVectorRegressor(kernel=Choice(None, SupportVectorRegressor.Kernel.linear()), c=Choice(0.5, 1.0)), + ] + + @pytest.fixture() def valid_data() -> TabularDataset: return Table( { - "id": [1, 4], - "feat1": [2, 5], - "feat2": [3, 6], - "target": [0, 1], + "id": [1, 4, 7, 10], + "feat1": [2, 5, 8, 11], + "feat2": [3, 6, 9, 12], + "target": [0, 1, 0, 1], }, ).to_tabular_dataset(target_name="target", extra_names=["id"]) +@pytest.mark.parametrize("regressor_with_choice", regressors_with_choices(), ids=lambda x: x.__class__.__name__) +class TestChoiceRegressors: + + def test_workflow_with_choice_parameter(self, regressor_with_choice: Regressor, valid_data: TabularDataset) -> None: + model = regressor_with_choice.fit_by_exhaustive_search(valid_data, RegressorMetric.MEAN_SQUARED_ERROR) + assert isinstance(model, type(regressor_with_choice)) + pred = model.predict(valid_data) + assert isinstance(pred, TabularDataset) + + def test_should_raise_if_model_is_fitted_with_choice( + self, + regressor_with_choice: Regressor, + valid_data: TabularDataset, + ) -> None: + with 
pytest.raises(FittingWithChoiceError): + regressor_with_choice.fit(valid_data) + + +class TestFitByExhaustiveSearch: + @pytest.mark.parametrize("regressor", regressors(), ids=lambda x: x.__class__.__name__) + def test_should_raise_if_model_is_fitted_by_exhaustive_search_without_choice( + self, + regressor: Regressor, + valid_data: TabularDataset, + ) -> None: + with pytest.raises(FittingWithoutChoiceError): + regressor.fit_by_exhaustive_search(valid_data, optimization_metric=RegressorMetric.MEAN_SQUARED_ERROR) + + @pytest.mark.parametrize( + "metric", + [ + RegressorMetric.MEAN_SQUARED_ERROR, + RegressorMetric.MEAN_ABSOLUTE_ERROR, + RegressorMetric.MEDIAN_ABSOLUTE_DEVIATION, + RegressorMetric.COEFFICIENT_OF_DETERMINATION, + ], + ids=["mean_squared_error", "mean_absolute_error", "median_absolute_deviation", "coefficient_of_determination"], + ) + def test_should_check_return_type_with_metric( + self, + valid_data: TabularDataset, + metric: RegressorMetric, + ) -> None: + fitted_model = AdaBoostRegressor(max_learner_count=Choice(2, 3)).fit_by_exhaustive_search( + valid_data, + optimization_metric=metric, + ) + assert isinstance(fitted_model, AdaBoostRegressor) + + def test_should_raise_when_dataset_misses_data(self) -> None: + with pytest.raises(DatasetMissesDataError): + AdaBoostRegressor(max_learner_count=Choice(2, 3)).fit_by_exhaustive_search( + Table.from_dict({"a": [], "b": []}).to_tabular_dataset("a"), + RegressorMetric.MEAN_SQUARED_ERROR, + ) + + @pytest.mark.parametrize("regressor", regressors(), ids=lambda x: x.__class__.__name__) class TestFit: def test_should_succeed_on_valid_data(self, regressor: Regressor, valid_data: TabularDataset) -> None: diff --git a/tests/safeds/ml/classical/regression/test_ridge_regression.py b/tests/safeds/ml/classical/regression/test_ridge_regression.py deleted file mode 100644 index 141c526bc..000000000 --- a/tests/safeds/ml/classical/regression/test_ridge_regression.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest -from 
safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table -from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.regression import RidgeRegressor - - -@pytest.fixture() -def training_set() -> TabularDataset: - table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1") - - -class TestAlpha: - def test_should_be_passed_to_fitted_model(self, training_set: TabularDataset) -> None: - fitted_model = RidgeRegressor(alpha=1).fit(training_set) - assert fitted_model.alpha == 1 - - def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None: - fitted_model = RidgeRegressor(alpha=1).fit(training_set) - assert fitted_model._wrapped_model is not None - assert fitted_model._wrapped_model.alpha == 1 - - @pytest.mark.parametrize("alpha", [-0.5], ids=["minus_zero_point_5"]) - def test_should_raise_if_less_than_0(self, alpha: float) -> None: - with pytest.raises(OutOfBoundsError): - RidgeRegressor(alpha=alpha) - - def test_should_warn_if_equal_to_0(self) -> None: - with pytest.warns( - UserWarning, - match=( - "Setting alpha to zero makes this model equivalent to LinearRegression. You " - "should use LinearRegression instead for better numerical stability." 
- ), - ): - RidgeRegressor(alpha=0) diff --git a/tests/safeds/ml/classical/regression/test_support_vector_machine.py b/tests/safeds/ml/classical/regression/test_support_vector_machine.py index 173e688b2..86d79fbd6 100644 --- a/tests/safeds/ml/classical/regression/test_support_vector_machine.py +++ b/tests/safeds/ml/classical/regression/test_support_vector_machine.py @@ -6,6 +6,7 @@ from safeds.exceptions import OutOfBoundsError from safeds.ml.classical._bases._support_vector_machine_base import _Linear, _Polynomial from safeds.ml.classical.regression import SupportVectorRegressor +from safeds.ml.hyperparameters import Choice def kernels() -> list[SupportVectorRegressor.Kernel]: @@ -44,8 +45,8 @@ def test_should_be_passed_to_sklearn(self, training_set: TabularDataset) -> None assert fitted_model._wrapped_model is not None assert fitted_model._wrapped_model.C == 2 - @pytest.mark.parametrize("c", [-1.0, 0.0], ids=["minus_one", "zero"]) - def test_should_raise_if_less_than_or_equal_to_0(self, c: float) -> None: + @pytest.mark.parametrize("c", [-1.0, 0.0, Choice(-1.0)], ids=["minus_one", "zero", "invalid_choice"]) + def test_should_raise_if_less_than_or_equal_to_0(self, c: float | Choice[float]) -> None: with pytest.raises(OutOfBoundsError): SupportVectorRegressor(c=c)