feat: regularization for decision trees and random forests #730

Merged (1 commit, May 5, 2024)

Changes from all commits
70 changes: 63 additions & 7 deletions src/safeds/ml/classical/classification/_decision_tree.py
@@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds.exceptions import ClosedBound, OutOfBoundsError
from safeds.ml.classical._util_sklearn import fit, predict

from ._classifier import Classifier
@@ -16,17 +17,66 @@


class DecisionTreeClassifier(Classifier):
"""Decision tree classification."""
"""
Decision tree classification.

Parameters
----------
maximum_depth:
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.

Raises
------
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __init__(
self,
*,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 1,
) -> None:
# Validation
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

# Internal state
self._wrapped_classifier: sk_DecisionTreeClassifier | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
return _structural_hash(
Classifier.__hash__(self),
self._feature_names,
self._target_name,
)

@property
def maximum_depth(self) -> int | None:
"""The maximum depth of the tree."""
return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
"""The minimum number of samples that must remain in the leaves of the tree."""
return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
"""
Create a copy of this classifier and fit it with the given training data.
@@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)

result = DecisionTreeClassifier(
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
@@ -105,4 +158,7 @@ def is_fitted(self) -> bool:
def _get_sklearn_classifier(self) -> ClassifierMixin:
from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier

return sk_DecisionTreeClassifier(
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
)
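
A minimal usage sketch of the new `DecisionTreeClassifier` surface. The import paths are inferred from the file locations in this PR, and the `Table.to_tabular_dataset` call used to build the training data is an assumed helper, not part of this diff:

```python
from safeds.data.tabular.containers import Table
from safeds.exceptions import OutOfBoundsError
from safeds.ml.classical.classification import DecisionTreeClassifier

# Assumed dataset-building API; the diff itself only consumes a TabularDataset.
training_set = Table(
    {"feature": [1, 2, 3, 4, 5, 6], "target": [0, 0, 0, 1, 1, 1]},
).to_tabular_dataset(target_name="target")

# Both hyperparameters are keyword-only and validated eagerly in __init__.
classifier = DecisionTreeClassifier(maximum_depth=3, minimum_number_of_samples_in_leaves=2)

# fit() returns a fitted copy that carries the same hyperparameters;
# the original instance stays unchanged.
fitted = classifier.fit(training_set)
assert fitted.maximum_depth == 3

# Out-of-bounds values fail fast, before any training happens.
try:
    DecisionTreeClassifier(maximum_depth=0)
except OutOfBoundsError as error:
    print(error)
```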
81 changes: 60 additions & 21 deletions src/safeds/ml/classical/classification/_random_forest.py
@@ -17,52 +17,82 @@


class RandomForestClassifier(Classifier):
"""Random forest classification.
"""
Random forest classification.

Parameters
----------
number_of_trees:
The number of trees to be used in the random forest. Has to be greater than 0.
maximum_depth:
The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0.

Raises
------
OutOfBoundsError
If `number_of_trees` is less than 1.
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __init__(
self,
*,
number_of_trees: int = 100,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 1,
) -> None:
# Validation
if number_of_trees < 1:
raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1))
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._number_of_trees: int = number_of_trees
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

# Internal state
self._wrapped_classifier: sk_RandomForestClassifier | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
return _structural_hash(
Classifier.__hash__(self),
self._feature_names,
self._target_name,
self._number_of_trees,
self._maximum_depth,
self._minimum_number_of_samples_in_leaves,
)

@property
def number_of_trees(self) -> int:
"""
Get the number of trees used in the random forest.

Returns
-------
result:
The number of trees.
"""
"""The number of trees used in the random forest."""
return self._number_of_trees

@property
def maximum_depth(self) -> int | None:
"""The maximum depth of each tree."""
return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
"""The minimum number of samples that must remain in the leaves of each tree."""
return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
"""
Create a copy of this classifier and fit it with the given training data.
@@ -95,7 +125,11 @@ def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)

result = RandomForestClassifier(
number_of_trees=self._number_of_trees,
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
@@ -149,4 +183,9 @@ def _get_sklearn_classifier(self) -> ClassifierMixin:
"""
from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier

return sk_RandomForestClassifier(
n_estimators=self._number_of_trees,
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
n_jobs=-1,
)
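
The keyword-to-sklearn mapping for the forest is the one spelled out in `_get_sklearn_classifier` above; a short sketch of the resulting surface, with import paths assumed as before:

```python
from safeds.ml.classical.classification import RandomForestClassifier

# All three hyperparameters are keyword-only and validated in __init__.
forest = RandomForestClassifier(
    number_of_trees=50,
    maximum_depth=8,
    minimum_number_of_samples_in_leaves=2,
)

# These values are forwarded to sklearn as n_estimators=50, max_depth=8,
# and min_samples_leaf=2 when the wrapped model is created.
assert forest.number_of_trees == 50
assert forest.maximum_depth == 8
assert forest.minimum_number_of_samples_in_leaves == 2
```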
70 changes: 63 additions & 7 deletions src/safeds/ml/classical/regression/_decision_tree.py
@@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds.exceptions import ClosedBound, OutOfBoundsError
from safeds.ml.classical._util_sklearn import fit, predict

from ._regressor import Regressor
@@ -16,17 +17,66 @@


class DecisionTreeRegressor(Regressor):
"""Decision tree regression."""
"""
Decision tree regression.

Parameters
----------
maximum_depth:
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.

Raises
------
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __init__(
self,
*,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 5,
) -> None:
# Validation
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

# Internal state
self._wrapped_regressor: sk_DecisionTreeRegressor | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
return _structural_hash(
Regressor.__hash__(self),
self._feature_names,
self._target_name,
)

@property
def maximum_depth(self) -> int | None:
"""The maximum depth of the tree."""
return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
"""The minimum number of samples that must remain in the leaves of the tree."""
return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
"""
Create a copy of this regressor and fit it with the given training data.
@@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
wrapped_regressor = self._get_sklearn_regressor()
fit(wrapped_regressor, training_set)

result = DecisionTreeRegressor(
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_regressor = wrapped_regressor
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
@@ -113,4 +166,7 @@ def _get_sklearn_regressor(self) -> RegressorMixin:
"""
from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor

return sk_DecisionTreeRegressor(
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
)
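
One detail that is easy to miss in the regression variant: its default is `minimum_number_of_samples_in_leaves=5`, unlike the classifier's `1`, and `fit()` copies the hyperparameters into the returned model. A sketch under the same assumed dataset-building API as above:

```python
from safeds.data.tabular.containers import Table
from safeds.ml.classical.regression import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
assert regressor.minimum_number_of_samples_in_leaves == 5  # regressor-specific default

# Assumed dataset-building API; not part of this diff.
training_set = Table(
    {"x": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], "y": [1.1, 1.9, 3.2, 4.1, 4.9, 6.0]},
).to_tabular_dataset(target_name="y")

# The fitted copy keeps the hyperparameters of the original instance.
fitted = regressor.fit(training_set)
assert fitted.minimum_number_of_samples_in_leaves == regressor.minimum_number_of_samples_in_leaves
assert fitted.maximum_depth is None  # depth unlimited by default
```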