feat: hyperparameter optimization for classical models (#843)
Closes #264 

Adjusted the classical ML models to accept Choice parameters for their hyperparameters.

New features for classifiers and regressors:
- combined the Linear, Lasso, Ridge, and ElasticNet regressors into a single ElasticNetRegressor
- changed property methods and parameter types to accept Choice values
- added fit_by_exhaustive_search(), which fits a model with all combinations of the given Choices (see the usage sketch after this list)
- added errors for using the wrong fit method (calling fit with Choice parameters, or fit_by_exhaustive_search without them)
- added the enums ClassifierMetric and RegressorMetric, which are passed to fit_by_exhaustive_search to select the optimization metric
- added cross-validation in fit_by_exhaustive_search
- added multiprocessing in fit_by_exhaustive_search
- added tests for all methods and classes

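In practice, the new workflow looks roughly like the sketch below. It is a minimal, hypothetical example pieced together from the feature list above: the import path for ClassifierMetric, the Choice constructor call, the metric member name, and the exact fit_by_exhaustive_search signature are assumptions, not confirmed API.

```python
# Hypothetical usage sketch; import paths, the Choice constructor, and the
# fit_by_exhaustive_search signature are assumed from the description above.
from safeds.data.tabular.containers import Table
from safeds.ml.classical.classification import RandomForestClassifier
from safeds.ml.hyperparameters import Choice
from safeds.ml.metrics import ClassifierMetric  # assumed import path

training_set = Table.from_csv_file("titanic.csv").to_tabular_dataset("survived")

# Hyperparameters may now be fixed values or Choices of candidate values.
model = RandomForestClassifier(
    tree_count=Choice(10, 50, 100),
    max_depth=Choice(None, 5, 10),
)

# Calling model.fit(training_set) here would raise FittingWithChoiceError,
# because the model still contains unresolved Choices. Instead, evaluate every
# combination with cross-validation and keep the best model:
fitted = model.fit_by_exhaustive_search(training_set, optimization_metric=ClassifierMetric.ACCURACY)
```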
---------

Co-authored-by: megalinter-bot <[email protected]>
Co-authored-by: Lars Reimann <[email protected]>
3 people authored Aug 31, 2024
1 parent b1e8933 commit d8f7491
Showing 51 changed files with 1,530 additions and 629 deletions.
28 changes: 19 additions & 9 deletions src/safeds/ml/classical/_bases/_ada_boost_base.py
@@ -5,6 +5,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 if TYPE_CHECKING:
     from safeds.ml.classical import SupervisedModel
@@ -18,16 +19,25 @@ class _AdaBoostBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        max_learner_count: int,
-        learning_rate: float,
+        max_learner_count: int | Choice[int],
+        learning_rate: float | Choice[float],
     ) -> None:
         # Validation
-        _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1))
-        _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
+        if isinstance(max_learner_count, Choice):
+            for mlc in max_learner_count:
+                _check_bounds("max_learner_count", mlc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_learner_count", max_learner_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(learning_rate, Choice):
+            for lr in learning_rate:
+                _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
 
         # Hyperparameters
-        self._max_learner_count: int = max_learner_count
-        self._learning_rate: float = learning_rate
+        self._max_learner_count: int | Choice[int] = max_learner_count
+        self._learning_rate: float | Choice[float] = learning_rate
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -40,16 +50,16 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def max_learner_count(self) -> int:
+    def max_learner_count(self) -> int | Choice[int]:
         """The maximum number of learners in the ensemble."""
         return self._max_learner_count
 
     @property
-    def learning_rate(self) -> float:
+    def learning_rate(self) -> float | Choice[float]:
         """The learning rate."""
         return self._learning_rate
 
     @property
     @abstractmethod
-    def learner(self) -> SupervisedModel | None:
+    def learner(self) -> SupervisedModel | None | Choice[SupervisedModel | None]:
         """The base learner used for training the ensemble."""
29 changes: 17 additions & 12 deletions src/safeds/ml/classical/_bases/_decision_tree_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _DecisionTreeBase(ABC):
@@ -14,20 +15,24 @@ class _DecisionTreeBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        max_depth: int | None,
-        min_sample_count_in_leaves: int,
+        max_depth: int | None | Choice[int | None],
+        min_sample_count_in_leaves: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
-        _check_bounds(
-            "min_sample_count_in_leaves",
-            min_sample_count_in_leaves,
-            lower_bound=_ClosedBound(1),
-        )
+        if isinstance(max_depth, Choice):
+            for md in max_depth:
+                _check_bounds("max_depth", md, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
+        if isinstance(min_sample_count_in_leaves, Choice):
+            for msc in min_sample_count_in_leaves:
+                _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("min_sample_count_in_leaves", min_sample_count_in_leaves, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
-        self._max_depth: int | None = max_depth
-        self._min_sample_count_in_leaves: int = min_sample_count_in_leaves
+        self._max_depth: int | None | Choice[int | None] = max_depth
+        self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -40,11 +45,11 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def max_depth(self) -> int | None:
+    def max_depth(self) -> int | None | Choice[int | None]:
         """The maximum depth of the tree."""
         return self._max_depth
 
     @property
-    def min_sample_count_in_leaves(self) -> int:
+    def min_sample_count_in_leaves(self) -> int | Choice[int]:
         """The minimum number of samples that must remain in the leaves of the tree."""
         return self._min_sample_count_in_leaves
22 changes: 16 additions & 6 deletions src/safeds/ml/classical/_bases/_gradient_boosting_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _GradientBoostingBase(ABC):
@@ -14,12 +15,21 @@ class _GradientBoostingBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        tree_count: int,
-        learning_rate: float,
+        tree_count: int | Choice[int],
+        learning_rate: float | Choice[float],
     ) -> None:
         # Validation
-        _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
-        _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
+        if isinstance(tree_count, Choice):
+            for tc in tree_count:
+                _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(learning_rate, Choice):
+            for lr in learning_rate:
+                _check_bounds("learning_rate", lr, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("learning_rate", learning_rate, lower_bound=_OpenBound(0))
 
         # Hyperparameters
         self._tree_count = tree_count
@@ -36,11 +46,11 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def tree_count(self) -> int:
+    def tree_count(self) -> int | Choice[int]:
         """The number of trees (estimators) in the ensemble."""
         return self._tree_count
 
     @property
-    def learning_rate(self) -> float:
+    def learning_rate(self) -> float | Choice[float]:
         """The learning rate."""
         return self._learning_rate
11 changes: 8 additions & 3 deletions src/safeds/ml/classical/_bases/_k_nearest_neighbors_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _KNearestNeighborsBase(ABC):
@@ -14,10 +15,14 @@ class _KNearestNeighborsBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        neighbor_count: int,
+        neighbor_count: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1))
+        if isinstance(neighbor_count, Choice):
+            for nc in neighbor_count:
+                _check_bounds("neighbor_count", nc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("neighbor_count", neighbor_count, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
         self._neighbor_count = neighbor_count
@@ -32,6 +37,6 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def neighbor_count(self) -> int:
+    def neighbor_count(self) -> int | Choice[int]:
         """The number of neighbors used for interpolation."""
         return self._neighbor_count
43 changes: 27 additions & 16 deletions src/safeds/ml/classical/_bases/_random_forest_base.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.ml.hyperparameters import Choice
 
 
 class _RandomForestBase(ABC):
@@ -14,23 +15,33 @@ class _RandomForestBase(ABC):
     @abstractmethod
     def __init__(
         self,
-        tree_count: int,
-        max_depth: int | None,
-        min_sample_count_in_leaves: int,
+        tree_count: int | Choice[int],
+        max_depth: int | None | Choice[int | None],
+        min_sample_count_in_leaves: int | Choice[int],
     ) -> None:
         # Validation
-        _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
-        _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
-        _check_bounds(
-            "min_sample_count_in_leaves",
-            min_sample_count_in_leaves,
-            lower_bound=_ClosedBound(1),
-        )
+        if isinstance(tree_count, Choice):
+            for tc in tree_count:
+                _check_bounds("tree_count", tc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("tree_count", tree_count, lower_bound=_ClosedBound(1))
+
+        if isinstance(max_depth, Choice):
+            for md in max_depth:
+                _check_bounds("max_depth", md, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("max_depth", max_depth, lower_bound=_ClosedBound(1))
+
+        if isinstance(min_sample_count_in_leaves, Choice):
+            for msc in min_sample_count_in_leaves:
+                _check_bounds("min_sample_count_in_leaves", msc, lower_bound=_ClosedBound(1))
+        else:
+            _check_bounds("min_sample_count_in_leaves", min_sample_count_in_leaves, lower_bound=_ClosedBound(1))
 
         # Hyperparameters
-        self._tree_count: int = tree_count
-        self._max_depth: int | None = max_depth
-        self._min_sample_count_in_leaves: int = min_sample_count_in_leaves
+        self._tree_count: int | Choice[int] = tree_count
+        self._max_depth: int | None | Choice[int | None] = max_depth
+        self._min_sample_count_in_leaves: int | Choice[int] = min_sample_count_in_leaves
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -44,16 +55,16 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def tree_count(self) -> int:
+    def tree_count(self) -> int | Choice[int]:
         """The number of trees used in the random forest."""
         return self._tree_count
 
     @property
-    def max_depth(self) -> int | None:
+    def max_depth(self) -> int | None | Choice[int | None]:
         """The maximum depth of each tree."""
         return self._max_depth
 
     @property
-    def min_sample_count_in_leaves(self) -> int:
+    def min_sample_count_in_leaves(self) -> int | Choice[int]:
         """The minimum number of samples that must remain in the leaves of each tree."""
         return self._min_sample_count_in_leaves
19 changes: 12 additions & 7 deletions src/safeds/ml/classical/_bases/_support_vector_machine_base.py
@@ -6,6 +6,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound, _OpenBound
+from safeds.ml.hyperparameters import Choice
 
 if TYPE_CHECKING:
     from sklearn.svm import SVC as SklearnSVC  # noqa: N811
@@ -76,18 +77,22 @@ def sigmoid() -> _SupportVectorMachineBase.Kernel:
     @abstractmethod
     def __init__(
         self,
-        c: float,
-        kernel: _SupportVectorMachineBase.Kernel | None,
+        c: float | Choice[float],
+        kernel: _SupportVectorMachineBase.Kernel | None | Choice[_SupportVectorMachineBase.Kernel | None],
     ) -> None:
         if kernel is None:
             kernel = _SupportVectorMachineBase.Kernel.radial_basis_function()
 
         # Validation
-        _check_bounds("c", c, lower_bound=_OpenBound(0))
+        if isinstance(c, Choice):
+            for value in c:
+                _check_bounds("c", value, lower_bound=_OpenBound(0))
+        else:
+            _check_bounds("c", c, lower_bound=_OpenBound(0))
 
         # Hyperparameters
-        self._c: float = c
-        self._kernel: _SupportVectorMachineBase.Kernel = kernel
+        self._c: float | Choice[float] = c
+        self._kernel: _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None] = kernel
 
     def __hash__(self) -> int:
         return _structural_hash(
@@ -100,14 +105,14 @@ def __hash__(self) -> int:
     # ------------------------------------------------------------------------------------------------------------------
 
     @property
-    def c(self) -> float:
+    def c(self) -> float | Choice[float]:
         """The regularization strength."""
         return self._c
 
     # This property is abstract, so subclasses must declare a public return type.
     @property
     @abstractmethod
-    def kernel(self) -> _SupportVectorMachineBase.Kernel:
+    def kernel(self) -> _SupportVectorMachineBase.Kernel | Choice[_SupportVectorMachineBase.Kernel | None]:
         """The type of kernel used."""
 
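One subtlety in this file: the None-to-default replacement for the kernel runs before the validation, so it only applies to a plain None argument; a Choice that contains None is stored unchanged, presumably to be resolved per candidate during the exhaustive search. Below is a sketch of the resulting behavior; the public class name and the Kernel access path are assumptions.

```python
from safeds.ml.classical.classification import SupportVectorClassifier  # class name assumed
from safeds.ml.hyperparameters import Choice

# A plain None kernel is replaced by the radial basis function default on construction.
plain = SupportVectorClassifier(c=1.0, kernel=None)

# A Choice containing None is kept as-is; each candidate (including None) is only
# materialized into a concrete model during fit_by_exhaustive_search().
tuned = SupportVectorClassifier(
    c=Choice(0.5, 1.0),
    kernel=Choice(None, SupportVectorClassifier.Kernel.sigmoid()),  # sigmoid() appears in the diff above
)
```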
28 changes: 19 additions & 9 deletions src/safeds/ml/classical/_supervised_model.py
@@ -80,6 +80,12 @@ def fit(self, training_set: TabularDataset) -> Self:
         Raises
         ------
         PlainTableError
+            If a table is passed instead of a TabularDataset.
+        DatasetMissesDataError
+            If the given training set contains no data.
+        FittingWithChoiceError
+            When trying to call this method on a model with hyperparameter choices.
         LearningError
             If the training data contains invalid values or if the training failed.
         """
@@ -88,7 +94,8 @@ def fit(self, training_set: TabularDataset) -> Self:
         if training_set.to_table().row_count == 0:
             raise DatasetMissesDataError
 
-        self._check_additional_fit_preconditions(training_set)
+        self._check_additional_fit_preconditions()
+        self._check_more_additional_fit_preconditions(training_set)
 
         wrapped_model = self._get_sklearn_model()
         _fit_sklearn_model_in_place(wrapped_model, training_set)
@@ -234,15 +241,14 @@ def get_target_type(self) -> DataType:
     # Template methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def _check_additional_fit_preconditions(self, training_set: TabularDataset) -> None:  # noqa: B027
-        """
-        Check additional preconditions for fitting the model and raise an error if any are violated.
-
-        Parameters
-        ----------
-        training_set:
-            The training data containing the features and target.
-        """
+    def _check_additional_fit_preconditions(self) -> None:  # noqa: B027
+        """Check additional preconditions for fitting the model and raise an error if any are violated."""
+
+    def _check_more_additional_fit_preconditions(self, training_set: TabularDataset) -> None:  # noqa: B027
+        """Check additional preconditions for fitting the model and raise an error if any are violated."""
+
+    def _check_additional_fit_by_exhaustive_search_preconditions(self) -> None:  # noqa: B027
+        """Check additional preconditions for fitting by exhaustive search and raise an error if any are violated."""
 
     def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset) -> None:  # noqa: B027
         """
@@ -254,6 +260,10 @@ def _check_additional_predict_preconditions(self, dataset: Table | TabularDataset) -> None:  # noqa: B027
             The dataset containing at least the features.
         """
 
+    def _get_models_for_all_choices(self) -> list[Self]:
+        """Get a list of all possible models, given the Parameter Choices."""
+        raise NotImplementedError  # pragma: no cover
+
     @abstractmethod
     def _clone(self) -> Self:
         """
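The new _get_models_for_all_choices template method is presumably what fit_by_exhaustive_search uses to expand a model with Choices into concrete candidates before cross-validating each one. Below is a minimal sketch of how a subclass might implement it, using a toy stand-in class rather than a real safeds model; only the iterability of Choice is assumed.

```python
import itertools
from typing import Any

from safeds.ml.hyperparameters import Choice


def _candidates(value: Any) -> list[Any]:
    """A plain value is a single candidate; a Choice (iterable) yields several."""
    return list(value) if isinstance(value, Choice) else [value]


class _ToyModel:
    """Illustrative stand-in, not part of safeds."""

    def __init__(self, tree_count: Any, learning_rate: Any) -> None:
        self._tree_count = tree_count
        self._learning_rate = learning_rate

    def _get_models_for_all_choices(self) -> list["_ToyModel"]:
        # One concrete model per combination of candidate hyperparameter values.
        return [
            _ToyModel(tree_count=tc, learning_rate=lr)
            for tc, lr in itertools.product(
                _candidates(self._tree_count),
                _candidates(self._learning_rate),
            )
        ]
```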