From ebcb811be12dd0287d6dedfb3e6745a2f0c3f122 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 24 May 2024 16:34:15 +0200 Subject: [PATCH 01/23] add basic implementation of BaselineClassifier --- .../ml/classical/classification/__init__.py | 3 + .../classification/_baseline_classifier.py | 87 +++++++++++++++++++ .../test_baseline_classifier.py | 21 +++++ 3 files changed, 111 insertions(+) create mode 100644 src/safeds/ml/classical/classification/_baseline_classifier.py create mode 100644 tests/safeds/ml/classical/classification/test_baseline_classifier.py diff --git a/src/safeds/ml/classical/classification/__init__.py b/src/safeds/ml/classical/classification/__init__.py index 6ad258333..7188452cc 100644 --- a/src/safeds/ml/classical/classification/__init__.py +++ b/src/safeds/ml/classical/classification/__init__.py @@ -13,11 +13,13 @@ from ._logistic_classifier import LogisticClassifier from ._random_forest_classifier import RandomForestClassifier from ._support_vector_classifier import SupportVectorClassifier + from ._baseline_classifier import BaselineClassifier apipkg.initpkg( __name__, { "AdaBoostClassifier": "._ada_boost_classifier:AdaBoostClassifier", + "BaselineClassifier": "._baseline_classifier:BaselineClassifier", "Classifier": "._classifier:Classifier", "DecisionTreeClassifier": "._decision_tree_classifier:DecisionTreeClassifier", "GradientBoostingClassifier": "._gradient_boosting_classifier:GradientBoostingClassifier", @@ -30,6 +32,7 @@ __all__ = [ "AdaBoostClassifier", + "BaselineClassifier", "Classifier", "DecisionTreeClassifier", "GradientBoostingClassifier", diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py new file mode 100644 index 000000000..b2028238e --- /dev/null +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -0,0 +1,87 @@ +import copy +from typing import Self + +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ModelNotFittedError +from safeds.ml.classical.classification import Classifier +from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ + DecisionTreeClassifier, GradientBoostingClassifier, KNearestNeighborsClassifier, SupportVectorClassifier + + +def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: + return model.fit(train_data) + + +def _predict_single_model(model: Classifier, test_data: TabularDataset) -> TabularDataset: + return model.predict(test_data) + + +class BaselineClassifier: + def __init__(self): + self._is_fitted = False + self._list_of_model_types = [AdaBoostClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier(), + SupportVectorClassifier(), KNearestNeighborsClassifier(2), RandomForestClassifier()] + self._fitted_models = [] + + def fit(self, train_data: TabularDataset) -> Self: + from concurrent.futures import ProcessPoolExecutor + + #Todo Validate data + copied_model = copy.deepcopy(self) + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + futures = [] + for model in self._list_of_model_types: + futures.append(executor.submit(_fit_single_model, model, train_data)) + for future in futures: + copied_model._fitted_models.append(future.result()) + executor.shutdown() + + copied_model._is_fitted = True + return copied_model + + def predict(self, test_data: TabularDataset) -> dict[str, float]: + from concurrent.futures import ProcessPoolExecutor + from safeds.ml.metrics import 
ClassificationMetrics + + if not self._is_fitted: + raise ModelNotFittedError + + #Todo Validate data + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + results = [] + futures = [] + for model in self._fitted_models: + futures.append(executor.submit(_predict_single_model, model, test_data)) + for future in futures: + results.append(future.result()) + executor.shutdown() + + max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0} + for result in results: + accuracy = ClassificationMetrics.accuracy(result, test_data) + + positive_class = test_data.target.get_value(0) + f1score = ClassificationMetrics.f1_score(result, test_data, positive_class) + precision = ClassificationMetrics.precision(result, test_data, positive_class) + recall = ClassificationMetrics.recall(result, test_data, positive_class) + + if max_metrics.get("accuracy") < accuracy: + max_metrics.update({"accuracy": accuracy}) + + if max_metrics.get("f1score") < f1score: + max_metrics.update({"f1score": f1score}) + + if max_metrics.get("precision") < precision: + max_metrics.update({"precision": precision}) + + if max_metrics.get("recall") < recall: + max_metrics.update({"recall": recall}) + + print(max_metrics) + return max_metrics + + @property + def is_fitted(self) -> bool: + return self._is_fitted diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py new file mode 100644 index 000000000..e5b25dcb3 --- /dev/null +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -0,0 +1,21 @@ +import pytest +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Table +from safeds.exceptions import OutOfBoundsError +from safeds.ml.classical.classification import AdaBoostClassifier, BaselineClassifier + + +@pytest.fixture() +def training_set() -> TabularDataset: + table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) + return table.to_tabular_dataset(target_name="col1") + + +class TestBaselineClassifier: + + def test_workflow(self, training_set): + classifier = BaselineClassifier() + fitted = classifier.fit(training_set) + fitted.predict(training_set) + assert fitted is not None + From 970021c628cdf6a881dbedea78fdc1d539ad41ed Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 24 May 2024 19:51:38 +0200 Subject: [PATCH 02/23] add basic implementation of BaselineRegressor --- .../ml/classical/regression/__init__.py | 3 + .../regression/_baseline_regressor.py | 83 +++++++++++++++++++ .../test_baseline_classifier.py | 11 ++- .../regression/test_baseline_regressor.py | 29 +++++++ 4 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 src/safeds/ml/classical/regression/_baseline_regressor.py create mode 100644 tests/safeds/ml/classical/regression/test_baseline_regressor.py diff --git a/src/safeds/ml/classical/regression/__init__.py b/src/safeds/ml/classical/regression/__init__.py index ed8c2bcbb..1dd3f627a 100644 --- a/src/safeds/ml/classical/regression/__init__.py +++ b/src/safeds/ml/classical/regression/__init__.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from ._ada_boost_regressor import AdaBoostRegressor from ._arima import ArimaModelRegressor + from ._baseline_regressor import BaselineRegressor from ._decision_tree_regressor import DecisionTreeRegressor from ._elastic_net_regressor import ElasticNetRegressor from ._gradient_boosting_regressor import GradientBoostingRegressor @@ -23,6 +24,7 @@ { 
"AdaBoostRegressor": "._ada_boost_regressor:AdaBoostRegressor", "ArimaModelRegressor": "._arima:ArimaModelRegressor", + "BaselineRegressor": "._baseline_regressor:BaselineRegressor", "DecisionTreeRegressor": "._decision_tree_regressor:DecisionTreeRegressor", "ElasticNetRegressor": "._elastic_net_regressor:ElasticNetRegressor", "GradientBoostingRegressor": "._gradient_boosting_regressor:GradientBoostingRegressor", @@ -39,6 +41,7 @@ __all__ = [ "AdaBoostRegressor", "ArimaModelRegressor", + "BaselineRegressor", "DecisionTreeRegressor", "ElasticNetRegressor", "GradientBoostingRegressor", diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py new file mode 100644 index 000000000..c3c278006 --- /dev/null +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -0,0 +1,83 @@ +import copy +from typing import Self + +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ModelNotFittedError +from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, RidgeRegressor, SupportVectorRegressor +from safeds.ml.classical.regression import Regressor + + +def _fit_single_model(model: Regressor, train_data: TabularDataset) -> Regressor: + return model.fit(train_data) + + +def _predict_single_model(model: Regressor, test_data: TabularDataset) -> TabularDataset: + return model.predict(test_data) + + +class BaselineRegressor: + def __init__(self): + self._is_fitted = False + self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), ElasticNetRegressor(), GradientBoostingRegressor(), KNearestNeighborsRegressor(5), LassoRegressor(), LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] + self._fitted_models = [] + + def fit(self, train_data: TabularDataset) -> Self: + from concurrent.futures import ProcessPoolExecutor + + #Todo Validate data + copied_model = copy.deepcopy(self) + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + futures = [] + for model in self._list_of_model_types: + futures.append(executor.submit(_fit_single_model, model, train_data)) + for future in futures: + copied_model._fitted_models.append(future.result()) + executor.shutdown() + + copied_model._is_fitted = True + return copied_model + + def predict(self, test_data: TabularDataset) -> dict[str, float]: + from concurrent.futures import ProcessPoolExecutor + from safeds.ml.metrics import RegressionMetrics + + if not self._is_fitted: + raise ModelNotFittedError + + #Todo Validate data + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + results = [] + futures = [] + for model in self._fitted_models: + futures.append(executor.submit(_predict_single_model, model, test_data)) + for future in futures: + results.append(future.result()) + executor.shutdown() + + max_metrics = {"coefficient_of_determination": 10000000000000000000000.0, "mean_absolute_error": 10000000000000000000000.0, "mean_squared_error": 10000000000000000000000.0, "median_absolute_deviation": 10000000000000000000000.0} + for result in results: + coefficient_of_determination = RegressionMetrics.coefficient_of_determination(result, test_data) + mean_absolute_error = RegressionMetrics.mean_squared_error(result, test_data) + mean_squared_error = RegressionMetrics.mean_absolute_error(result, 
test_data) + median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) + + if max_metrics.get("coefficient_of_determination") > coefficient_of_determination: + max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) + + if max_metrics.get("mean_absolute_error") > mean_absolute_error: + max_metrics.update({"mean_absolute_error": mean_absolute_error}) + + if max_metrics.get("mean_squared_error") > mean_squared_error: + max_metrics.update({"mean_squared_error": mean_squared_error}) + + if max_metrics.get("median_absolute_deviation") > median_absolute_deviation: + max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) + + print(max_metrics) + return max_metrics + + @property + def is_fitted(self) -> bool: + return self._is_fitted diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index e5b25dcb3..b430e3a3b 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -14,8 +14,15 @@ def training_set() -> TabularDataset: class TestBaselineClassifier: def test_workflow(self, training_set): + input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\classification\\avocado.csv") + table = input.remove_columns_except(["AveragePrice", "Total Volume", "4046", "4225", "4770", "Total Bags", "Small Bags", "Large Bags", + "type"]) + [train, test] = table.split_rows(0.8) + train = train.to_tabular_dataset(target_name="type") + test = test.to_tabular_dataset(target_name="type") + classifier = BaselineClassifier() - fitted = classifier.fit(training_set) - fitted.predict(training_set) + fitted = classifier.fit(train) + fitted.predict(test) assert fitted is not None diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py new file mode 100644 index 000000000..363227154 --- /dev/null +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -0,0 +1,29 @@ +import pytest +from safeds.data.labeled.containers import TabularDataset +from safeds.data.tabular.containers import Table +from safeds.exceptions import OutOfBoundsError +from safeds.ml.classical.classification import AdaBoostClassifier, BaselineClassifier +from safeds.ml.classical.regression import BaselineRegressor + + +@pytest.fixture() +def training_set() -> TabularDataset: + table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) + return table.to_tabular_dataset(target_name="col1") + + +class TestBaselineRegressor: + + def test_workflow(self, training_set): + input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\classification\\avocado.csv") + table = input.remove_columns_except(["AveragePrice", "Total Volume", "4046", "4225", "4770", "Total Bags", "Small Bags", "Large Bags", + "type"]) + [train, test] = table.split_rows(0.8) + train = train.to_tabular_dataset(target_name="type") + test = test.to_tabular_dataset(target_name="type") + + regressor = BaselineRegressor() + fitted = regressor.fit(train) + fitted.predict(test) + assert fitted is not None + From 1e78d18fac0f018cdbb9f5ddaf1a2cfb680a4a84 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 25 May 2024 22:54:05 +0200 Subject: [PATCH 03/23] change stuff to terminate execution after max 30 sec --- .../regression/_baseline_regressor.py | 71 
++++++++++++++++--- .../regression/test_baseline_regressor.py | 14 ++-- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index c3c278006..f6b8a2fbf 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -1,8 +1,12 @@ import copy +import time +from concurrent.futures import as_completed, FIRST_COMPLETED, wait from typing import Self from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError +from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation import StandardScaler +from safeds.exceptions import ModelNotFittedError, NonNumericColumnError, DatasetMissesDataError from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, RidgeRegressor, SupportVectorRegressor from safeds.ml.classical.regression import Regressor @@ -18,20 +22,40 @@ def _predict_single_model(model: Regressor, test_data: TabularDataset) -> Tabula class BaselineRegressor: def __init__(self): self._is_fitted = False - self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), ElasticNetRegressor(), GradientBoostingRegressor(), KNearestNeighborsRegressor(5), LassoRegressor(), LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] + #self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), ElasticNetRegressor(), GradientBoostingRegressor(), KNearestNeighborsRegressor(5), LassoRegressor(), LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] + self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), + GradientBoostingRegressor(), KNearestNeighborsRegressor(5), + RandomForestRegressor(), + SupportVectorRegressor()] + self._fitted_models = [] + self._feature_names = None def fit(self, train_data: TabularDataset) -> Self: from concurrent.futures import ProcessPoolExecutor - #Todo Validate data + #Remove Invalid Data + #target_name = train_data.target.name + #table = train_data.to_table() + #table = table.remove_non_numeric_columns() + #table = table.remove_rows_with_missing_values() + #table = table.remove_duplicate_rows() + + #if table.row_count == 0: + # raise TODO Decide which Error to Raise, as DatasetMissesDataError might confuse Users as we might remove some data ourselfs + + # Scale features + #train_data = table.to_tabular_dataset(target_name) + train_data = self._standard_scale_tabular_dataset(train_data) + copied_model = copy.deepcopy(self) with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: futures = [] for model in self._list_of_model_types: futures.append(executor.submit(_fit_single_model, model, train_data)) - for future in futures: + [done, _] = wait(futures, timeout=10) + for future in done: copied_model._fitted_models.append(future.result()) executor.shutdown() @@ -45,25 +69,28 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if not self._is_fitted: raise ModelNotFittedError - #Todo Validate data + # Todo Validate data + test_data = self._standard_scale_tabular_dataset(test_data) + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: results = [] futures = [] for model in 
self._fitted_models: futures.append(executor.submit(_predict_single_model, model, test_data)) - for future in futures: + [done, _] = wait(futures, timeout=20, return_when=FIRST_COMPLETED) + for future in done: results.append(future.result()) executor.shutdown() - max_metrics = {"coefficient_of_determination": 10000000000000000000000.0, "mean_absolute_error": 10000000000000000000000.0, "mean_squared_error": 10000000000000000000000.0, "median_absolute_deviation": 10000000000000000000000.0} + max_metrics = {"coefficient_of_determination": float('-inf'), "mean_absolute_error": float('inf'), "mean_squared_error": float('inf'), "median_absolute_deviation": float('inf')} for result in results: coefficient_of_determination = RegressionMetrics.coefficient_of_determination(result, test_data) - mean_absolute_error = RegressionMetrics.mean_squared_error(result, test_data) - mean_squared_error = RegressionMetrics.mean_absolute_error(result, test_data) + mean_absolute_error = RegressionMetrics.mean_absolute_error(result, test_data) + mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) - if max_metrics.get("coefficient_of_determination") > coefficient_of_determination: + if max_metrics.get("coefficient_of_determination") < coefficient_of_determination: max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) if max_metrics.get("mean_absolute_error") > mean_absolute_error: @@ -75,9 +102,31 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if max_metrics.get("median_absolute_deviation") > median_absolute_deviation: max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) - print(max_metrics) + print(Table( + { + "best metrics achieved": [ + "coefficient_of_determination", + "mean_absolute_error", + "mean_squared_error", + "median_absolute_deviation", + ], + "value": [ + max_metrics.get("coefficient_of_determination"), + max_metrics.get("mean_absolute_error"), + max_metrics.get("mean_squared_error"), + max_metrics.get("median_absolute_deviation"), + ], + }, + )) return max_metrics @property def is_fitted(self) -> bool: return self._is_fitted + + def _standard_scale_tabular_dataset(self, data: TabularDataset) -> TabularDataset: + target = data.target + ss = StandardScaler() + [_, scaled_features] = ss.fit_and_transform(data.features) + return scaled_features.add_columns([target]).to_tabular_dataset(target.name) + diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 363227154..e26aa0a36 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,9 +1,10 @@ import pytest from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table +from safeds.data.tabular.transformation import LabelEncoder, StandardScaler from safeds.exceptions import OutOfBoundsError from safeds.ml.classical.classification import AdaBoostClassifier, BaselineClassifier -from safeds.ml.classical.regression import BaselineRegressor +from safeds.ml.classical.regression import BaselineRegressor, ElasticNetRegressor, LassoRegressor, LinearRegressor @pytest.fixture() @@ -15,12 +16,13 @@ def training_set() -> TabularDataset: class TestBaselineRegressor: def test_workflow(self, training_set): - input = 
Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\classification\\avocado.csv") - table = input.remove_columns_except(["AveragePrice", "Total Volume", "4046", "4225", "4770", "Total Bags", "Small Bags", "Large Bags", - "type"]) + input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\regression\\houses.csv") + table = input.remove_columns(["id", "lat", "long", "zipcode", "condition", "grade", "date"]) + #TODO Not scaling the data makes the Regressor take 10 Minutes instead of 20 Seconds + [train, test] = table.split_rows(0.8) - train = train.to_tabular_dataset(target_name="type") - test = test.to_tabular_dataset(target_name="type") + train = train.to_tabular_dataset(target_name="price") + test = test.to_tabular_dataset(target_name="price") regressor = BaselineRegressor() fitted = regressor.fit(train) From 608f1f64ceefbc933bb0fabb62a2f95a75483b53 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 31 May 2024 14:45:31 +0200 Subject: [PATCH 04/23] Add data validation --- src/safeds/exceptions/_ml.py | 6 +- .../classification/_baseline_classifier.py | 55 +++++++++++-- .../regression/_baseline_regressor.py | 77 ++++++++++--------- .../regression/test_baseline_regressor.py | 19 +++-- 4 files changed, 104 insertions(+), 53 deletions(-) diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index d84395485..7105b7cc9 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -65,16 +65,16 @@ def __init__(self, reason: str): class FeatureDataMismatchError(Exception): - """Raised when the columns of the table passed to the predict or fit method do not match with the specified features of the neural network.""" + """Raised when the columns of the table passed to the predict or fit method do not match with the specified features of the model.""" def __init__(self) -> None: super().__init__( - "The features in the given table do not match with the specified feature columns names of the neural network.", + "The features in the given table do not match with the specified feature columns names of model.", ) class InputSizeError(Exception): - """Raised when the amount of features being passed to a network does not match with its input size.""" + """Raised when the amount of features being passed to a model does not match with its input size.""" def __init__(self, data_size: int | ModelImageSize, input_layer_size: int | ModelImageSize | None) -> None: # TODO: remove input_layer_size type None again diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index b2028238e..826698473 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -1,8 +1,11 @@ import copy +from concurrent.futures import wait, ALL_COMPLETED, ProcessPoolExecutor from typing import Self +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError +from safeds.data.tabular.containers import Table +from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.classification import Classifier from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ DecisionTreeClassifier, GradientBoostingClassifier, KNearestNeighborsClassifier, 
SupportVectorClassifier @@ -17,37 +20,60 @@ def _predict_single_model(model: Classifier, test_data: TabularDataset) -> Tabul class BaselineClassifier: - def __init__(self): + def __init__(self, include_slower_models: bool = False): self._is_fitted = False - self._list_of_model_types = [AdaBoostClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier(), - SupportVectorClassifier(), KNearestNeighborsClassifier(2), RandomForestClassifier()] + self._list_of_model_types = [AdaBoostClassifier(), DecisionTreeClassifier(), + SupportVectorClassifier(), RandomForestClassifier()] + # TODO maybe add KNearestNeighbors to extended models + if include_slower_models: + self._list_of_model_types.extend([GradientBoostingClassifier()]) + self._fitted_models = [] + self._feature_names = None + self._target_name = None def fit(self, train_data: TabularDataset) -> Self: from concurrent.futures import ProcessPoolExecutor - #Todo Validate data + # Validate Data + train_data_as_table = train_data.to_table() + if train_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names) + copied_model = copy.deepcopy(self) with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: futures = [] for model in self._list_of_model_types: futures.append(executor.submit(_fit_single_model, model, train_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) for future in futures: copied_model._fitted_models.append(future.result()) executor.shutdown() copied_model._is_fitted = True + copied_model._feature_names = train_data.features.column_names + copied_model._target_name = train_data.target.name return copied_model def predict(self, test_data: TabularDataset) -> dict[str, float]: + # TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import ClassificationMetrics if not self._is_fitted: raise ModelNotFittedError - #Todo Validate data + # Validate data + if not self._feature_names == test_data.features.column_names: + raise FeatureDataMismatchError + # if not self._target_name == test_data.target.name: + # raise TODO Create new Error for this Case? 
+ test_data_as_table = test_data.to_table() + if test_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names) with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: results = [] @@ -79,7 +105,22 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if max_metrics.get("recall") < recall: max_metrics.update({"recall": recall}) - print(max_metrics) + print(Table( + { + "Metric": [ + "accuracy", + "f1score", + "precision", + "recall", + ], + "Best value": [ + max_metrics.get("accuracy"), + max_metrics.get("f1score"), + max_metrics.get("precision"), + max_metrics.get("recall"), + ], + }, + )) return max_metrics @property diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index f6b8a2fbf..f757f19f1 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -1,13 +1,16 @@ import copy import time -from concurrent.futures import as_completed, FIRST_COMPLETED, wait +from concurrent.futures import as_completed, FIRST_COMPLETED, wait, ALL_COMPLETED from typing import Self +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import StandardScaler -from safeds.exceptions import ModelNotFittedError, NonNumericColumnError, DatasetMissesDataError -from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, RidgeRegressor, SupportVectorRegressor +from safeds.exceptions import ModelNotFittedError, NonNumericColumnError, DatasetMissesDataError, \ + FeatureDataMismatchError +from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ + GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ + RidgeRegressor, SupportVectorRegressor from safeds.ml.classical.regression import Regressor @@ -20,33 +23,28 @@ def _predict_single_model(model: Regressor, test_data: TabularDataset) -> Tabula class BaselineRegressor: - def __init__(self): + def __init__(self, include_slower_models: bool = False): self._is_fitted = False - #self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), ElasticNetRegressor(), GradientBoostingRegressor(), KNearestNeighborsRegressor(5), LassoRegressor(), LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] + #TODO maybe add KNearestNeighbors self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), - GradientBoostingRegressor(), KNearestNeighborsRegressor(5), - RandomForestRegressor(), + LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] + if include_slower_models: + self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) + self._fitted_models = [] self._feature_names = None + self._target_name = None def fit(self, train_data: TabularDataset) -> Self: from concurrent.futures import ProcessPoolExecutor - #Remove Invalid Data - #target_name = train_data.target.name - #table = train_data.to_table() - 
#table = table.remove_non_numeric_columns() - #table = table.remove_rows_with_missing_values() - #table = table.remove_duplicate_rows() - - #if table.row_count == 0: - # raise TODO Decide which Error to Raise, as DatasetMissesDataError might confuse Users as we might remove some data ourselfs - - # Scale features - #train_data = table.to_tabular_dataset(target_name) - train_data = self._standard_scale_tabular_dataset(train_data) + #Validate Data + train_data_as_table = train_data.to_table() + if train_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names) copied_model = copy.deepcopy(self) @@ -54,36 +52,48 @@ def fit(self, train_data: TabularDataset) -> Self: futures = [] for model in self._list_of_model_types: futures.append(executor.submit(_fit_single_model, model, train_data)) - [done, _] = wait(futures, timeout=10) + [done, _] = wait(futures, return_when=ALL_COMPLETED) for future in done: copied_model._fitted_models.append(future.result()) executor.shutdown() copied_model._is_fitted = True + copied_model._feature_names = train_data.features.column_names + copied_model._target_name = train_data.target.name return copied_model def predict(self, test_data: TabularDataset) -> dict[str, float]: + #TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import RegressionMetrics if not self._is_fitted: raise ModelNotFittedError - # Todo Validate data - test_data = self._standard_scale_tabular_dataset(test_data) - - + # Validate data + if not self._feature_names == test_data.features.column_names: + raise FeatureDataMismatchError + #if not self._target_name == test_data.target.name: + # raise TODO Create new Error for this Case? 
+ test_data_as_table = test_data.to_table() + if test_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names) + + # Start Processes with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: results = [] futures = [] for model in self._fitted_models: futures.append(executor.submit(_predict_single_model, model, test_data)) - [done, _] = wait(futures, timeout=20, return_when=FIRST_COMPLETED) + [done, _] = wait(futures, return_when=ALL_COMPLETED) for future in done: results.append(future.result()) executor.shutdown() - max_metrics = {"coefficient_of_determination": float('-inf'), "mean_absolute_error": float('inf'), "mean_squared_error": float('inf'), "median_absolute_deviation": float('inf')} + # Calculate Metrics + max_metrics = {"coefficient_of_determination": float('-inf'), "mean_absolute_error": float('inf'), + "mean_squared_error": float('inf'), "median_absolute_deviation": float('inf')} for result in results: coefficient_of_determination = RegressionMetrics.coefficient_of_determination(result, test_data) mean_absolute_error = RegressionMetrics.mean_absolute_error(result, test_data) @@ -104,13 +114,13 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: print(Table( { - "best metrics achieved": [ + "Metric": [ "coefficient_of_determination", "mean_absolute_error", "mean_squared_error", "median_absolute_deviation", ], - "value": [ + "Best value": [ max_metrics.get("coefficient_of_determination"), max_metrics.get("mean_absolute_error"), max_metrics.get("mean_squared_error"), @@ -123,10 +133,3 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: @property def is_fitted(self) -> bool: return self._is_fitted - - def _standard_scale_tabular_dataset(self, data: TabularDataset) -> TabularDataset: - target = data.target - ss = StandardScaler() - [_, scaled_features] = ss.fit_and_transform(data.features) - return scaled_features.add_columns([target]).to_tabular_dataset(target.name) - diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index e26aa0a36..ffa948ca6 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,10 +1,8 @@ import pytest from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import LabelEncoder, StandardScaler -from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.classification import AdaBoostClassifier, BaselineClassifier -from safeds.ml.classical.regression import BaselineRegressor, ElasticNetRegressor, LassoRegressor, LinearRegressor +from safeds.data.tabular.transformation import StandardScaler +from safeds.ml.classical.regression import BaselineRegressor @pytest.fixture() @@ -16,16 +14,25 @@ def training_set() -> TabularDataset: class TestBaselineRegressor: def test_workflow(self, training_set): + import time input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\regression\\houses.csv") table = input.remove_columns(["id", "lat", "long", "zipcode", "condition", "grade", "date"]) #TODO Not scaling the data makes the Regressor take 10 Minutes instead of 20 Seconds + target = table.get_column("price") + ss = StandardScaler(column_names=table.column_names.remove("price")) + [_, 
scaled_features] = ss.fit_and_transform(table.remove_columns(["price"])) + table = scaled_features.add_columns([target]) [train, test] = table.split_rows(0.8) train = train.to_tabular_dataset(target_name="price") test = test.to_tabular_dataset(target_name="price") - regressor = BaselineRegressor() + start_time = time.time() + regressor = BaselineRegressor(include_slower_models=False) fitted = regressor.fit(train) - fitted.predict(test) + results = fitted.predict(test) + end_time = time.time() + + print(f"Time needed: {end_time-start_time}") assert fitted is not None From 91db191eaea1a0a50406731d0ca0ed157e5c2381 Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 31 May 2024 16:09:42 +0200 Subject: [PATCH 05/23] Add tests --- .../classification/_baseline_classifier.py | 5 +- .../test_baseline_classifier.py | 77 ++++++++++++---- .../regression/test_baseline_regressor.py | 87 ++++++++++++------- 3 files changed, 119 insertions(+), 50 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 826698473..cb934c68d 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -48,7 +48,7 @@ def fit(self, train_data: TabularDataset) -> Self: for model in self._list_of_model_types: futures.append(executor.submit(_fit_single_model, model, train_data)) [done, _] = wait(futures, return_when=ALL_COMPLETED) - for future in futures: + for future in done: copied_model._fitted_models.append(future.result()) executor.shutdown() @@ -80,7 +80,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: futures = [] for model in self._fitted_models: futures.append(executor.submit(_predict_single_model, model, test_data)) - for future in futures: + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: results.append(future.result()) executor.shutdown() diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index b430e3a3b..b79279bff 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -1,28 +1,67 @@ import pytest -from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table -from safeds.exceptions import OutOfBoundsError -from safeds.ml.classical.classification import AdaBoostClassifier, BaselineClassifier +from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError +from safeds.ml.classical.classification import BaselineClassifier -@pytest.fixture() -def training_set() -> TabularDataset: - table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1") +#TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to +#TODO only fit a model once and pass it to all predict test cases. 
+class TestBaselineClassifier: + def test_should_raise_if_fit_dataset_contains_no_data(self): + model = BaselineClassifier() + data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + with pytest.raises(DatasetMissesDataError): + model.fit(data) + def test_should_raise_if_predict_dataset_contains_no_data(self): + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesDataError): + model.predict(predict_data) -class TestBaselineClassifier: + def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineClassifier() + data = Table({"feat": ["a", "b"], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ColumnTypeError): + model.fit(data) + + def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(ColumnTypeError): + model.predict(predict_data) + + def test_should_check_that_fit_returns_baseline_classifier(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert type(model.fit(data)) == BaselineClassifier - def test_workflow(self, training_set): - input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\classification\\avocado.csv") - table = input.remove_columns_except(["AveragePrice", "Total Volume", "4046", "4225", "4770", "Total Bags", "Small Bags", "Large Bags", - "type"]) - [train, test] = table.split_rows(0.8) - train = train.to_tabular_dataset(target_name="type") - test = test.to_tabular_dataset(target_name="type") + def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert not model.is_fitted + model = model.fit(data) + assert model.is_fitted - classifier = BaselineClassifier() - fitted = classifier.fit(train) - fitted.predict(test) - assert fitted is not None + def test_should_raise_if_predict_data_has_differing_features(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"other": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(FeatureDataMismatchError): + model.predict(predict_data) + def test_check_predict_return_type_and_values(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(data) + result = model.predict(data) + assert isinstance(result, dict) + assert result.get("accuracy") >= 0.0 + assert result.get("f1score") >= 0.0 + assert result.get("precision") >= 0.0 + assert result.get("recall") >= 0.0 diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index ffa948ca6..2cb24a0a5 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,38 +1,67 @@ import pytest 
-from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import StandardScaler +from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError from safeds.ml.classical.regression import BaselineRegressor -@pytest.fixture() -def training_set() -> TabularDataset: - table = Table({"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}) - return table.to_tabular_dataset(target_name="col1") +# TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to +# TODO only fit a model once and pass it to all predict test cases. +class TestBaselineRegressor: + def test_should_raise_if_fit_dataset_contains_no_data(self): + model = BaselineRegressor() + data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + with pytest.raises(DatasetMissesDataError): + model.fit(data) + def test_should_raise_if_predict_dataset_contains_no_data(self): + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesDataError): + model.predict(predict_data) -class TestBaselineRegressor: + def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineRegressor() + data = Table({"feat": ["a", "b"], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ColumnTypeError): + model.fit(data) + + def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(ColumnTypeError): + model.predict(predict_data) + + def test_should_check_that_fit_returns_baseline_classifier(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert type(model.fit(data)) == BaselineRegressor + + def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert not model.is_fitted + model = model.fit(data) + assert model.is_fitted - def test_workflow(self, training_set): - import time - input = Table.from_csv_file("D:\\Library_jetzt_aber_wirklich\\src\\safeds\\ml\\classical\\regression\\houses.csv") - table = input.remove_columns(["id", "lat", "long", "zipcode", "condition", "grade", "date"]) - #TODO Not scaling the data makes the Regressor take 10 Minutes instead of 20 Seconds - target = table.get_column("price") - ss = StandardScaler(column_names=table.column_names.remove("price")) - [_, scaled_features] = ss.fit_and_transform(table.remove_columns(["price"])) - table = scaled_features.add_columns([target]) - - [train, test] = table.split_rows(0.8) - train = train.to_tabular_dataset(target_name="price") - test = test.to_tabular_dataset(target_name="price") - - start_time = time.time() - regressor = BaselineRegressor(include_slower_models=False) - fitted = regressor.fit(train) - results = fitted.predict(test) - end_time = time.time() - - print(f"Time needed: {end_time-start_time}") - assert fitted is not None + def 
test_should_raise_if_predict_data_has_differing_features(self) -> None: + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"other": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(FeatureDataMismatchError): + model.predict(predict_data) + def test_check_predict_return_type_and_values(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(data) + result = model.predict(data) + assert isinstance(result, dict) + assert result.get("coefficient_of_determination") >= float("-inf") + assert result.get("mean_absolute_error") <= float("inf") + assert result.get("mean_squared_error") <= float("inf") + assert result.get("median_absolute_deviation") <= float("inf") From eab2a39b0043e77bf500e2e4463584e2b1e079c7 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 17:53:39 +0200 Subject: [PATCH 06/23] Linter fixes --- .../classification/_baseline_classifier.py | 21 ++---------- .../regression/_baseline_regressor.py | 32 ++++--------------- .../test_baseline_classifier.py | 2 +- .../regression/test_baseline_regressor.py | 2 +- 4 files changed, 11 insertions(+), 46 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index cb934c68d..566ce15e5 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -1,14 +1,13 @@ import copy -from concurrent.futures import wait, ALL_COMPLETED, ProcessPoolExecutor +from concurrent.futures import wait, ALL_COMPLETED from typing import Self from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.classification import Classifier from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ - DecisionTreeClassifier, GradientBoostingClassifier, KNearestNeighborsClassifier, SupportVectorClassifier + DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: @@ -106,22 +105,6 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if max_metrics.get("recall") < recall: max_metrics.update({"recall": recall}) - print(Table( - { - "Metric": [ - "accuracy", - "f1score", - "precision", - "recall", - ], - "Best value": [ - max_metrics.get("accuracy"), - max_metrics.get("f1score"), - max_metrics.get("precision"), - max_metrics.get("recall"), - ], - }, - )) return max_metrics @property diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index f757f19f1..3e601c070 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -1,15 +1,13 @@ import copy -import time -from concurrent.futures import as_completed, FIRST_COMPLETED, wait, ALL_COMPLETED -from typing import Self +from concurrent.futures import wait, ALL_COMPLETED +from typing import Self, List from safeds._validation._check_columns_are_numeric import 
_check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table -from safeds.exceptions import ModelNotFittedError, NonNumericColumnError, DatasetMissesDataError, \ +from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, \ FeatureDataMismatchError from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ - GradientBoostingRegressor, KNearestNeighborsRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ + GradientBoostingRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ RidgeRegressor, SupportVectorRegressor from safeds.ml.classical.regression import Regressor @@ -33,9 +31,9 @@ def __init__(self, include_slower_models: bool = False): if include_slower_models: self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) - self._fitted_models = [] - self._feature_names = None - self._target_name = None + self._fitted_models: List[Regressor] = [] + self._feature_names: List[str] | None = None + self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: from concurrent.futures import ProcessPoolExecutor @@ -112,22 +110,6 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if max_metrics.get("median_absolute_deviation") > median_absolute_deviation: max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) - print(Table( - { - "Metric": [ - "coefficient_of_determination", - "mean_absolute_error", - "mean_squared_error", - "median_absolute_deviation", - ], - "Best value": [ - max_metrics.get("coefficient_of_determination"), - max_metrics.get("mean_absolute_error"), - max_metrics.get("mean_squared_error"), - max_metrics.get("median_absolute_deviation"), - ], - }, - )) return max_metrics @property diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index b79279bff..311fb4ae5 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -38,7 +38,7 @@ def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): def test_should_check_that_fit_returns_baseline_classifier(self) -> None: model = BaselineClassifier() data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") - assert type(model.fit(data)) == BaselineClassifier + assert isinstance(model.fit(data), BaselineClassifier) def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: model = BaselineClassifier() diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 2cb24a0a5..0665d3a76 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -38,7 +38,7 @@ def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): def test_should_check_that_fit_returns_baseline_classifier(self) -> None: model = BaselineRegressor() data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") - assert type(model.fit(data)) == BaselineRegressor + assert isinstance(model.fit(data), BaselineRegressor) def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: model = BaselineRegressor() From 
4b6edd72144ccbd6c3fb0997e7ee12ad8d84fc6e Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 18:33:16 +0200 Subject: [PATCH 07/23] Linter fixes --- .../classification/_baseline_classifier.py | 16 ++++++++-------- .../classical/regression/_baseline_regressor.py | 8 ++++---- .../classification/test_baseline_classifier.py | 6 +++--- .../regression/test_baseline_regressor.py | 14 +++++++------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 566ce15e5..a5262e25e 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -1,6 +1,6 @@ import copy from concurrent.futures import wait, ALL_COMPLETED -from typing import Self +from typing import Self, List from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset @@ -27,9 +27,9 @@ def __init__(self, include_slower_models: bool = False): if include_slower_models: self._list_of_model_types.extend([GradientBoostingClassifier()]) - self._fitted_models = [] - self._feature_names = None - self._target_name = None + self._fitted_models: List[Classifier] = [] + self._feature_names: List[str] | None = None + self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: from concurrent.futures import ProcessPoolExecutor @@ -93,16 +93,16 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: precision = ClassificationMetrics.precision(result, test_data, positive_class) recall = ClassificationMetrics.recall(result, test_data, positive_class) - if max_metrics.get("accuracy") < accuracy: + if max_metrics.get("accuracy") is not None and max_metrics.get("accuracy") < accuracy: max_metrics.update({"accuracy": accuracy}) - if max_metrics.get("f1score") < f1score: + if max_metrics.get("f1score") is not None and max_metrics.get("f1score") < f1score: max_metrics.update({"f1score": f1score}) - if max_metrics.get("precision") < precision: + if max_metrics.get("precision") is not None and max_metrics.get("precision") < precision: max_metrics.update({"precision": precision}) - if max_metrics.get("recall") < recall: + if max_metrics.get("recall") is not None and max_metrics.get("recall") < recall: max_metrics.update({"recall": recall}) return max_metrics diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 3e601c070..337adec95 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -98,16 +98,16 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) - if max_metrics.get("coefficient_of_determination") < coefficient_of_determination: + if max_metrics.get("coefficient_of_determination") is not None and max_metrics.get("coefficient_of_determination") < coefficient_of_determination: max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) - if max_metrics.get("mean_absolute_error") > mean_absolute_error: + if max_metrics.get("mean_absolute_error") is not None and max_metrics.get("mean_absolute_error") > mean_absolute_error: 
max_metrics.update({"mean_absolute_error": mean_absolute_error}) - if max_metrics.get("mean_squared_error") > mean_squared_error: + if max_metrics.get("mean_squared_error") is not None and max_metrics.get("mean_squared_error") > mean_squared_error: max_metrics.update({"mean_squared_error": mean_squared_error}) - if max_metrics.get("median_absolute_deviation") > median_absolute_deviation: + if max_metrics.get("median_absolute_deviation") is not None and max_metrics.get("median_absolute_deviation") > median_absolute_deviation: max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) return max_metrics diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 311fb4ae5..882d9cf3d 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -7,13 +7,13 @@ #TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to #TODO only fit a model once and pass it to all predict test cases. class TestBaselineClassifier: - def test_should_raise_if_fit_dataset_contains_no_data(self): + def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: model = BaselineClassifier() data = Table({"feat": [], "target": []}).to_tabular_dataset("target") with pytest.raises(DatasetMissesDataError): model.fit(data) - def test_should_raise_if_predict_dataset_contains_no_data(self): + def test_should_raise_if_predict_dataset_contains_no_data(self) -> None: model = BaselineClassifier() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") @@ -27,7 +27,7 @@ def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> Non with pytest.raises(ColumnTypeError): model.fit(data) - def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): + def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self) -> None: model = BaselineClassifier() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 0665d3a76..1d418431a 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -7,13 +7,13 @@ # TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to # TODO only fit a model once and pass it to all predict test cases. 
class TestBaselineRegressor: - def test_should_raise_if_fit_dataset_contains_no_data(self): + def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: model = BaselineRegressor() data = Table({"feat": [], "target": []}).to_tabular_dataset("target") with pytest.raises(DatasetMissesDataError): model.fit(data) - def test_should_raise_if_predict_dataset_contains_no_data(self): + def test_should_raise_if_predict_dataset_contains_no_data(self) -> None: model = BaselineRegressor() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") @@ -27,7 +27,7 @@ def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> Non with pytest.raises(ColumnTypeError): model.fit(data) - def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self): + def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self) -> None: model = BaselineRegressor() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") @@ -61,7 +61,7 @@ def test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) - assert result.get("coefficient_of_determination") >= float("-inf") - assert result.get("mean_absolute_error") <= float("inf") - assert result.get("mean_squared_error") <= float("inf") - assert result.get("median_absolute_deviation") <= float("inf") + assert result.get("coefficient_of_determination") is not None and result.get("coefficient_of_determination") >= float("-inf") + assert result.get("mean_absolute_error") is not None and result.get("mean_absolute_error") <= float("inf") + assert result.get("mean_squared_error") is not None and result.get("mean_squared_error") <= float("inf") + assert result.get("median_absolute_deviation") is not None and result.get("median_absolute_deviation") <= float("inf") From ab6296e35d97006d01e56baca2a60f697fcb963a Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 20:34:57 +0200 Subject: [PATCH 08/23] add docs --- .../classification/_baseline_classifier.py | 66 +++++++++++++++++-- .../regression/_baseline_regressor.py | 58 +++++++++++++++- 2 files changed, 118 insertions(+), 6 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index a5262e25e..8e1fd3bc7 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -7,7 +7,7 @@ from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.classification import Classifier from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ - DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier + DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: @@ -19,12 +19,19 @@ def _predict_single_model(model: Classifier, test_data: TabularDataset) -> Tabul class BaselineClassifier: - def __init__(self, include_slower_models: bool = False): + """ + Baseline Classifier. + + Get a baseline by fitting data on multiple different models and comparing the best metrics. 
+ + Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the + classifier. This might result in significantly higher runtime. + """ + def __init__(self, extended_search: bool = False): self._is_fitted = False self._list_of_model_types = [AdaBoostClassifier(), DecisionTreeClassifier(), - SupportVectorClassifier(), RandomForestClassifier()] - # TODO maybe add KNearestNeighbors to extended models - if include_slower_models: + SupportVectorClassifier(), RandomForestClassifier()] + if extended_search: self._list_of_model_types.extend([GradientBoostingClassifier()]) self._fitted_models: List[Classifier] = [] @@ -32,6 +39,28 @@ def __init__(self, include_slower_models: bool = False): self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: + """ + Train the Classifier with given training data. + + The original model is not modified. + + Parameters + ---------- + train_data: + The data the network should be trained on. + + Returns + ------- + trained_classifier: + The trained Classifier + + Raises + ------ + DatasetMissesDataError + If the given train_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. + """ from concurrent.futures import ProcessPoolExecutor # Validate Data @@ -57,6 +86,32 @@ def fit(self, train_data: TabularDataset) -> Self: return copied_model def predict(self, test_data: TabularDataset) -> dict[str, float]: + """ + Make a prediction for the given test data and calculate the best metrics. + + The original Model is not modified. + + Parameters + ---------- + test_data: + The data the Classifier should predict. + + Returns + ------- + best_metrics: + A dictionary with the best metrics that were achieved. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet + FeatureDataMismatchError + If the features of the test data do not match with the features of the trained Classifier. + DatasetMissesDataError + If the given test_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. + """ # TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import ClassificationMetrics @@ -109,4 +164,5 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: @property def is_fitted(self) -> bool: + """Whether the model is fitted.""" return self._is_fitted diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 337adec95..8ed655821 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -21,9 +21,16 @@ def _predict_single_model(model: Regressor, test_data: TabularDataset) -> Tabula class BaselineRegressor: + """ + Baseline Regressor. + + Get a baseline by fitting data on multiple different models and comparing the best metrics. + + Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the + classifier. This might result in significantly higher runtime. 
+ """ def __init__(self, include_slower_models: bool = False): self._is_fitted = False - #TODO maybe add KNearestNeighbors self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), SupportVectorRegressor()] @@ -36,6 +43,28 @@ def __init__(self, include_slower_models: bool = False): self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: + """ + Train the Regressor with given training data. + + The original model is not modified. + + Parameters + ---------- + train_data: + The data the network should be trained on. + + Returns + ------- + trained_classifier: + The trained Regressor + + Raises + ------ + DatasetMissesDataError + If the given train_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. + """ from concurrent.futures import ProcessPoolExecutor #Validate Data @@ -61,6 +90,32 @@ def fit(self, train_data: TabularDataset) -> Self: return copied_model def predict(self, test_data: TabularDataset) -> dict[str, float]: + """ + Make a prediction for the given test data and calculate the best metrics. + + The original Model is not modified. + + Parameters + ---------- + test_data: + The data the Regressor should predict. + + Returns + ------- + best_metrics: + A dictionary with the best metrics that were achieved. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet + FeatureDataMismatchError + If the features of the test data do not match with the features of the trained Regressor. + DatasetMissesDataError + If the given test_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. + """ #TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import RegressionMetrics @@ -114,4 +169,5 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: @property def is_fitted(self) -> bool: + """Whether the model is fitted.""" return self._is_fitted From a6e306163e7ea80760a2e9715055d54bcfc3705b Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 22:28:10 +0200 Subject: [PATCH 09/23] linter fixes --- .../classification/test_baseline_classifier.py | 4 ++++ .../classical/regression/test_baseline_regressor.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 882d9cf3d..f3d09c260 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -61,6 +61,10 @@ def test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) + assert result.get("accuracy") is not None + assert result.get("f1score") is not None + assert result.get("precision") is not None + assert result.get("recall") is not None assert result.get("accuracy") >= 0.0 assert result.get("f1score") >= 0.0 assert result.get("precision") >= 0.0 diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 1d418431a..7145fdea6 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -61,7 +61,11 @@ def 
test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) - assert result.get("coefficient_of_determination") is not None and result.get("coefficient_of_determination") >= float("-inf") - assert result.get("mean_absolute_error") is not None and result.get("mean_absolute_error") <= float("inf") - assert result.get("mean_squared_error") is not None and result.get("mean_squared_error") <= float("inf") - assert result.get("median_absolute_deviation") is not None and result.get("median_absolute_deviation") <= float("inf") + assert result.get("coefficient_of_determination") is not None + assert result.get("mean_absolute_error") is not None + assert result.get("mean_squared_error") is not None + assert result.get("median_absolute_deviation") is not None + assert result.get("coefficient_of_determination") >= float("-inf") + assert result.get("mean_absolute_error") <= float("inf") + assert result.get("mean_squared_error") <= float("inf") + assert result.get("median_absolute_deviation") <= float("inf") From f6b1002a08e0d648888575133ec9ab8fd72ddd69 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 22:40:49 +0200 Subject: [PATCH 10/23] linter fixes --- src/safeds/ml/classical/regression/_baseline_regressor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 8ed655821..d35178365 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -153,8 +153,9 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) - if max_metrics.get("coefficient_of_determination") is not None and max_metrics.get("coefficient_of_determination") < coefficient_of_determination: - max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) + if max_metrics.get("coefficient_of_determination") is not None: + if max_metrics.get("coefficient_of_determination") < coefficient_of_determination: + max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) if max_metrics.get("mean_absolute_error") is not None and max_metrics.get("mean_absolute_error") > mean_absolute_error: max_metrics.update({"mean_absolute_error": mean_absolute_error}) From 6af2d0b51b291b6659c46243c75653fbe818c6c5 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 22:53:22 +0200 Subject: [PATCH 11/23] linter fixes --- .../classical/classification/_baseline_classifier.py | 8 ++++---- .../ml/classical/regression/_baseline_regressor.py | 11 +++++------ .../classification/test_baseline_classifier.py | 12 ++++-------- .../classical/regression/test_baseline_regressor.py | 12 ++++-------- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 8e1fd3bc7..7dd57226d 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -148,16 +148,16 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: precision = ClassificationMetrics.precision(result, test_data, positive_class) recall = 
ClassificationMetrics.recall(result, test_data, positive_class) - if max_metrics.get("accuracy") is not None and max_metrics.get("accuracy") < accuracy: + if max_metrics.get("accuracy", 0.0) < accuracy: max_metrics.update({"accuracy": accuracy}) - if max_metrics.get("f1score") is not None and max_metrics.get("f1score") < f1score: + if max_metrics.get("f1score", 0.0) < f1score: max_metrics.update({"f1score": f1score}) - if max_metrics.get("precision") is not None and max_metrics.get("precision") < precision: + if max_metrics.get("precision", 0.0) < precision: max_metrics.update({"precision": precision}) - if max_metrics.get("recall") is not None and max_metrics.get("recall") < recall: + if max_metrics.get("recall", 0.0) < recall: max_metrics.update({"recall": recall}) return max_metrics diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index d35178365..467ec03f9 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -153,17 +153,16 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) - if max_metrics.get("coefficient_of_determination") is not None: - if max_metrics.get("coefficient_of_determination") < coefficient_of_determination: - max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) + if max_metrics.get("coefficient_of_determination", float('-inf')) < coefficient_of_determination: + max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) - if max_metrics.get("mean_absolute_error") is not None and max_metrics.get("mean_absolute_error") > mean_absolute_error: + if max_metrics.get("mean_absolute_error", float('inf')) > mean_absolute_error: max_metrics.update({"mean_absolute_error": mean_absolute_error}) - if max_metrics.get("mean_squared_error") is not None and max_metrics.get("mean_squared_error") > mean_squared_error: + if max_metrics.get("mean_squared_error", float('inf')) > mean_squared_error: max_metrics.update({"mean_squared_error": mean_squared_error}) - if max_metrics.get("median_absolute_deviation") is not None and max_metrics.get("median_absolute_deviation") > median_absolute_deviation: + if max_metrics.get("median_absolute_deviation", float('inf')) > median_absolute_deviation: max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) return max_metrics diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index f3d09c260..8b48ae850 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -61,11 +61,7 @@ def test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) - assert result.get("accuracy") is not None - assert result.get("f1score") is not None - assert result.get("precision") is not None - assert result.get("recall") is not None - assert result.get("accuracy") >= 0.0 - assert result.get("f1score") >= 0.0 - assert result.get("precision") >= 0.0 - assert result.get("recall") >= 0.0 + assert result.get("accuracy", 0.0) >= 0.0 + assert result.get("f1score", 0.0) >= 0.0 + assert 
result.get("precision", 0.0) >= 0.0 + assert result.get("recall", 0.0) >= 0.0 diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 7145fdea6..ee6dbc1ec 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -61,11 +61,7 @@ def test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) - assert result.get("coefficient_of_determination") is not None - assert result.get("mean_absolute_error") is not None - assert result.get("mean_squared_error") is not None - assert result.get("median_absolute_deviation") is not None - assert result.get("coefficient_of_determination") >= float("-inf") - assert result.get("mean_absolute_error") <= float("inf") - assert result.get("mean_squared_error") <= float("inf") - assert result.get("median_absolute_deviation") <= float("inf") + assert result.get("coefficient_of_determination", float('-inf')) >= float("-inf") + assert result.get("mean_absolute_error", float('inf')) <= float("inf") + assert result.get("mean_squared_error", float('inf')) <= float("inf") + assert result.get("median_absolute_deviation", float('inf')) <= float("inf") From a1b415385e390a9a6883a280d8d10f81feeda178 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Sat, 1 Jun 2024 20:55:23 +0000 Subject: [PATCH 12/23] style: apply automated linter fixes --- .../ml/classical/classification/__init__.py | 2 +- .../classification/_baseline_classifier.py | 31 +++++++--- .../regression/_baseline_regressor.py | 62 ++++++++++++------- .../test_baseline_classifier.py | 6 +- .../regression/test_baseline_regressor.py | 10 +-- 5 files changed, 70 insertions(+), 41 deletions(-) diff --git a/src/safeds/ml/classical/classification/__init__.py b/src/safeds/ml/classical/classification/__init__.py index 7188452cc..cff7ecbc2 100644 --- a/src/safeds/ml/classical/classification/__init__.py +++ b/src/safeds/ml/classical/classification/__init__.py @@ -6,6 +6,7 @@ if TYPE_CHECKING: from ._ada_boost_classifier import AdaBoostClassifier + from ._baseline_classifier import BaselineClassifier from ._classifier import Classifier from ._decision_tree_classifier import DecisionTreeClassifier from ._gradient_boosting_classifier import GradientBoostingClassifier @@ -13,7 +14,6 @@ from ._logistic_classifier import LogisticClassifier from ._random_forest_classifier import RandomForestClassifier from ._support_vector_classifier import SupportVectorClassifier - from ._baseline_classifier import BaselineClassifier apipkg.initpkg( __name__, diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 7dd57226d..4d060e5a6 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -1,13 +1,18 @@ import copy -from concurrent.futures import wait, ALL_COMPLETED -from typing import Self, List +from concurrent.futures import ALL_COMPLETED, wait +from typing import Self from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError -from safeds.ml.classical.classification 
import Classifier -from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ - DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier +from safeds.exceptions import DatasetMissesDataError, FeatureDataMismatchError, ModelNotFittedError +from safeds.ml.classical.classification import ( + AdaBoostClassifier, + Classifier, + DecisionTreeClassifier, + GradientBoostingClassifier, + RandomForestClassifier, + SupportVectorClassifier, +) def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: @@ -27,15 +32,20 @@ class BaselineClassifier: Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the classifier. This might result in significantly higher runtime. """ + def __init__(self, extended_search: bool = False): self._is_fitted = False - self._list_of_model_types = [AdaBoostClassifier(), DecisionTreeClassifier(), - SupportVectorClassifier(), RandomForestClassifier()] + self._list_of_model_types = [ + AdaBoostClassifier(), + DecisionTreeClassifier(), + SupportVectorClassifier(), + RandomForestClassifier(), + ] if extended_search: self._list_of_model_types.extend([GradientBoostingClassifier()]) - self._fitted_models: List[Classifier] = [] - self._feature_names: List[str] | None = None + self._fitted_models: list[Classifier] = [] + self._feature_names: list[str] | None = None self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: @@ -114,6 +124,7 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: """ # TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor + from safeds.ml.metrics import ClassificationMetrics if not self._is_fitted: diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 467ec03f9..e878eae77 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -1,15 +1,22 @@ import copy -from concurrent.futures import wait, ALL_COMPLETED -from typing import Self, List +from concurrent.futures import ALL_COMPLETED, wait +from typing import Self from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, \ - FeatureDataMismatchError -from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ - GradientBoostingRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ - RidgeRegressor, SupportVectorRegressor -from safeds.ml.classical.regression import Regressor +from safeds.exceptions import DatasetMissesDataError, FeatureDataMismatchError, ModelNotFittedError +from safeds.ml.classical.regression import ( + AdaBoostRegressor, + DecisionTreeRegressor, + ElasticNetRegressor, + GradientBoostingRegressor, + LassoRegressor, + LinearRegressor, + RandomForestRegressor, + Regressor, + RidgeRegressor, + SupportVectorRegressor, +) def _fit_single_model(model: Regressor, train_data: TabularDataset) -> Regressor: @@ -29,17 +36,23 @@ class BaselineRegressor: Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the classifier. This might result in significantly higher runtime. 
""" + def __init__(self, include_slower_models: bool = False): self._is_fitted = False - self._list_of_model_types = [AdaBoostRegressor(), DecisionTreeRegressor(), - LinearRegressor(), RandomForestRegressor(), RidgeRegressor(), - SupportVectorRegressor()] + self._list_of_model_types = [ + AdaBoostRegressor(), + DecisionTreeRegressor(), + LinearRegressor(), + RandomForestRegressor(), + RidgeRegressor(), + SupportVectorRegressor(), + ] if include_slower_models: self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) - self._fitted_models: List[Regressor] = [] - self._feature_names: List[str] | None = None + self._fitted_models: list[Regressor] = [] + self._feature_names: list[str] | None = None self._target_name: str | None = None def fit(self, train_data: TabularDataset) -> Self: @@ -67,7 +80,7 @@ def fit(self, train_data: TabularDataset) -> Self: """ from concurrent.futures import ProcessPoolExecutor - #Validate Data + # Validate Data train_data_as_table = train_data.to_table() if train_data_as_table.row_count == 0: raise DatasetMissesDataError @@ -116,8 +129,9 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: ColumnTypeError If one or more columns contain non-numeric values. """ - #TODO Think about combining fit and predict into one method + # TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor + from safeds.ml.metrics import RegressionMetrics if not self._is_fitted: @@ -126,7 +140,7 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: # Validate data if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError - #if not self._target_name == test_data.target.name: + # if not self._target_name == test_data.target.name: # raise TODO Create new Error for this Case? 
test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: @@ -145,24 +159,28 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: executor.shutdown() # Calculate Metrics - max_metrics = {"coefficient_of_determination": float('-inf'), "mean_absolute_error": float('inf'), - "mean_squared_error": float('inf'), "median_absolute_deviation": float('inf')} + max_metrics = { + "coefficient_of_determination": float("-inf"), + "mean_absolute_error": float("inf"), + "mean_squared_error": float("inf"), + "median_absolute_deviation": float("inf"), + } for result in results: coefficient_of_determination = RegressionMetrics.coefficient_of_determination(result, test_data) mean_absolute_error = RegressionMetrics.mean_absolute_error(result, test_data) mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) - if max_metrics.get("coefficient_of_determination", float('-inf')) < coefficient_of_determination: + if max_metrics.get("coefficient_of_determination", float("-inf")) < coefficient_of_determination: max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) - if max_metrics.get("mean_absolute_error", float('inf')) > mean_absolute_error: + if max_metrics.get("mean_absolute_error", float("inf")) > mean_absolute_error: max_metrics.update({"mean_absolute_error": mean_absolute_error}) - if max_metrics.get("mean_squared_error", float('inf')) > mean_squared_error: + if max_metrics.get("mean_squared_error", float("inf")) > mean_squared_error: max_metrics.update({"mean_squared_error": mean_squared_error}) - if max_metrics.get("median_absolute_deviation", float('inf')) > median_absolute_deviation: + if max_metrics.get("median_absolute_deviation", float("inf")) > median_absolute_deviation: max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) return max_metrics diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 8b48ae850..b290d2d31 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -1,11 +1,11 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError +from safeds.exceptions import ColumnTypeError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.classification import BaselineClassifier -#TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to -#TODO only fit a model once and pass it to all predict test cases. +# TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to +# TODO only fit a model once and pass it to all predict test cases. 
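# Sketch of the workflow the docstrings in this series describe: `fit` returns a fitted
# copy of the baseline model, and `predict` returns the best value reached for each
# metric across all fitted models (the regressor mirrors this with regression metrics).
# Illustrative only; the variable names and data below are made up.
from safeds.data.tabular.containers import Table
from safeds.ml.classical.classification import BaselineClassifier

train = Table({"feat": [0, 1, 0, 1], "target": [0, 1, 0, 1]}).to_tabular_dataset("target")
fitted = BaselineClassifier().fit(train)  # the original, unfitted model is not modified
best_metrics = fitted.predict(train)      # keys: "accuracy", "f1score", "precision", "recall"
assert best_metrics["f1score"] >= 0.0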
class TestBaselineClassifier: def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: model = BaselineClassifier() diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index ee6dbc1ec..09b8d78ae 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,6 +1,6 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError +from safeds.exceptions import ColumnTypeError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.regression import BaselineRegressor @@ -61,7 +61,7 @@ def test_check_predict_return_type_and_values(self) -> None: model = model.fit(data) result = model.predict(data) assert isinstance(result, dict) - assert result.get("coefficient_of_determination", float('-inf')) >= float("-inf") - assert result.get("mean_absolute_error", float('inf')) <= float("inf") - assert result.get("mean_squared_error", float('inf')) <= float("inf") - assert result.get("median_absolute_deviation", float('inf')) <= float("inf") + assert result.get("coefficient_of_determination", float("-inf")) >= float("-inf") + assert result.get("mean_absolute_error", float("inf")) <= float("inf") + assert result.get("mean_squared_error", float("inf")) <= float("inf") + assert result.get("median_absolute_deviation", float("inf")) <= float("inf") From 6baff710ed5c99cadc3a9ce072b5eea2f2e60d79 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 23:36:01 +0200 Subject: [PATCH 13/23] Add DatasetMissesTargetError --- src/safeds/exceptions/__init__.py | 3 ++- src/safeds/exceptions/_ml.py | 13 +++++++++++++ .../classification/_baseline_classifier.py | 8 ++++---- .../ml/classical/regression/_baseline_regressor.py | 9 +++++---- .../classification/test_baseline_classifier.py | 11 ++++++++++- .../classical/regression/test_baseline_regressor.py | 12 +++++++++++- 6 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 8f1e9de6d..a767d59e7 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -22,7 +22,7 @@ LearningError, ModelNotFittedError, PlainTableError, - PredictionError, + PredictionError, DatasetMissesTargetError, ) @@ -68,6 +68,7 @@ class OutOfBoundsError(SafeDsError): # ML exceptions "DatasetMissesDataError", "DatasetMissesFeaturesError", + "DatasetMissesTargetError", "FeatureDataMismatchError", "InputSizeError", "InvalidModelStructureError", diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index 7105b7cc9..1aabbf1bb 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -14,6 +14,19 @@ class DatasetMissesFeaturesError(ValueError): def __init__(self, missing_feature_names: list[str]): super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.") +class DatasetMissesTargetError(ValueError): + """ + Raised when a dataset misses the target column. + + Parameters + ---------- + missing_target_name: + The names of the missing target column. 
+ """ + + def __init__(self, missing_target_name: str): + super().__init__(f"Dataset misses the target column '{missing_target_name}'.") + class DatasetMissesDataError(ValueError): """Raised when a dataset contains no rows.""" diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 7dd57226d..f26af3aed 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -4,7 +4,8 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError +from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError, \ + DatasetMissesTargetError from safeds.ml.classical.classification import Classifier from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier @@ -112,7 +113,6 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: ColumnTypeError If one or more columns contain non-numeric values. """ - # TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import ClassificationMetrics @@ -122,8 +122,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: # Validate data if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError - # if not self._target_name == test_data.target.name: - # raise TODO Create new Error for this Case? + if not self._target_name == test_data.target.name: + raise DatasetMissesTargetError(self._target_name) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 467ec03f9..8506e1b4a 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -5,7 +5,7 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, \ - FeatureDataMismatchError + FeatureDataMismatchError, DatasetMissesTargetError from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ GradientBoostingRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ RidgeRegressor, SupportVectorRegressor @@ -113,10 +113,11 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: If the features of the test data do not match with the features of the trained Regressor. DatasetMissesDataError If the given test_data contains no data. + DatasetMissesTargetError + If the given test_data misses the target column. ColumnTypeError If one or more columns contain non-numeric values. 
""" - #TODO Think about combining fit and predict into one method from concurrent.futures import ProcessPoolExecutor from safeds.ml.metrics import RegressionMetrics @@ -126,8 +127,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: # Validate data if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError - #if not self._target_name == test_data.target.name: - # raise TODO Create new Error for this Case? + if not self._target_name == test_data.target.name: + raise DatasetMissesTargetError(self._target_name) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 8b48ae850..782f8563f 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -1,6 +1,7 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError +from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError, \ + DatasetMissesTargetError from safeds.ml.classical.classification import BaselineClassifier @@ -55,6 +56,14 @@ def test_should_raise_if_predict_data_has_differing_features(self) -> None: with pytest.raises(FeatureDataMismatchError): model.predict(predict_data) + def test_should_raise_if_predict_data_misses_target_column(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesTargetError): + model.predict(predict_data) + def test_check_predict_return_type_and_values(self) -> None: model = BaselineClassifier() data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index ee6dbc1ec..fc9264f9b 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,6 +1,7 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError +from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError, \ + DatasetMissesTargetError from safeds.ml.classical.regression import BaselineRegressor @@ -55,6 +56,15 @@ def test_should_raise_if_predict_data_has_differing_features(self) -> None: with pytest.raises(FeatureDataMismatchError): model.predict(predict_data) + + def test_should_raise_if_predict_data_misses_target_column(self) -> None: + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesTargetError): + model.predict(predict_data) + def test_check_predict_return_type_and_values(self) -> None: model = BaselineRegressor() data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") From 
da67f73c583b5db81c13b34718f854fd07b26782 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 23:41:18 +0200 Subject: [PATCH 14/23] add docs --- src/safeds/ml/classical/classification/_baseline_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 2a84fec76..70ddc3638 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -124,11 +124,12 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: If the features of the test data do not match with the features of the trained Classifier. DatasetMissesDataError If the given test_data contains no data. + DatasetMissesTargetError + If the given test_data misses the target column. ColumnTypeError If one or more columns contain non-numeric values. """ from concurrent.futures import ProcessPoolExecutor - from safeds.ml.metrics import ClassificationMetrics if not self._is_fitted: From 8e12811e502ac6a9763c0d9bbc59c2f5d2f1d980 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 23:45:58 +0200 Subject: [PATCH 15/23] linter fix --- .../classification/_baseline_classifier.py | 11 +---------- .../classical/regression/_baseline_regressor.py | 15 +-------------- .../classification/test_baseline_classifier.py | 1 - .../regression/test_baseline_regressor.py | 3 +-- 4 files changed, 3 insertions(+), 27 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 70ddc3638..17537f50a 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -4,15 +4,6 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import DatasetMissesDataError, FeatureDataMismatchError, ModelNotFittedError -from safeds.ml.classical.classification import ( - AdaBoostClassifier, - Classifier, - DecisionTreeClassifier, - GradientBoostingClassifier, - RandomForestClassifier, - SupportVectorClassifier, -) from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError, \ DatasetMissesTargetError from safeds.ml.classical.classification import Classifier @@ -51,7 +42,7 @@ def __init__(self, extended_search: bool = False): self._fitted_models: list[Classifier] = [] self._feature_names: list[str] | None = None - self._target_name: str | None = None + self._target_name: str = "none" def fit(self, train_data: TabularDataset) -> Self: """ diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index bb2c38df1..35508cfc5 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -4,19 +4,6 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import DatasetMissesDataError, FeatureDataMismatchError, ModelNotFittedError -from safeds.ml.classical.regression import ( - AdaBoostRegressor, - DecisionTreeRegressor, - ElasticNetRegressor, - GradientBoostingRegressor, - LassoRegressor, - LinearRegressor, - RandomForestRegressor, - 
Regressor, - RidgeRegressor, - SupportVectorRegressor, -) from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, \ FeatureDataMismatchError, DatasetMissesTargetError from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ @@ -59,7 +46,7 @@ def __init__(self, include_slower_models: bool = False): self._fitted_models: list[Regressor] = [] self._feature_names: list[str] | None = None - self._target_name: str | None = None + self._target_name: str = "none" def fit(self, train_data: TabularDataset) -> Self: """ diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index add9dce54..996e32c0a 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -1,6 +1,5 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import ColumnTypeError, DatasetMissesDataError, FeatureDataMismatchError from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError, \ DatasetMissesTargetError from safeds.ml.classical.classification import BaselineClassifier diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 77d038ad6..35fefb75e 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,7 +1,6 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError, \ - DatasetMissesTargetError +from safeds.exceptions import DatasetMissesTargetError from safeds.exceptions import ColumnTypeError, DatasetMissesDataError, FeatureDataMismatchError from safeds.ml.classical.regression import BaselineRegressor From a62c6179154509d5ae408489aeb5d53cacd1a828 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Sat, 1 Jun 2024 21:47:41 +0000 Subject: [PATCH 16/23] style: apply automated linter fixes --- src/safeds/exceptions/__init__.py | 3 ++- src/safeds/exceptions/_ml.py | 1 + .../classification/_baseline_classifier.py | 20 ++++++++++++---- .../regression/_baseline_regressor.py | 24 ++++++++++++++----- .../test_baseline_classifier.py | 8 +++++-- .../regression/test_baseline_regressor.py | 9 ++++--- 6 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index a767d59e7..65ee819d8 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -16,13 +16,14 @@ from ._ml import ( DatasetMissesDataError, DatasetMissesFeaturesError, + DatasetMissesTargetError, FeatureDataMismatchError, InputSizeError, InvalidModelStructureError, LearningError, ModelNotFittedError, PlainTableError, - PredictionError, DatasetMissesTargetError, + PredictionError, ) diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index 1aabbf1bb..ded5a3678 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -14,6 +14,7 @@ class DatasetMissesFeaturesError(ValueError): def __init__(self, missing_feature_names: list[str]): super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.") + class DatasetMissesTargetError(ValueError): """ 
Raised when a dataset misses the target column. diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 17537f50a..f38043d20 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -4,11 +4,20 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, FeatureDataMismatchError, \ - DatasetMissesTargetError -from safeds.ml.classical.classification import Classifier -from safeds.ml.classical.classification import RandomForestClassifier, AdaBoostClassifier, \ - DecisionTreeClassifier, GradientBoostingClassifier, SupportVectorClassifier +from safeds.exceptions import ( + DatasetMissesDataError, + DatasetMissesTargetError, + FeatureDataMismatchError, + ModelNotFittedError, +) +from safeds.ml.classical.classification import ( + AdaBoostClassifier, + Classifier, + DecisionTreeClassifier, + GradientBoostingClassifier, + RandomForestClassifier, + SupportVectorClassifier, +) def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: @@ -121,6 +130,7 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: If one or more columns contain non-numeric values. """ from concurrent.futures import ProcessPoolExecutor + from safeds.ml.metrics import ClassificationMetrics if not self._is_fitted: diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 35508cfc5..f83d0c294 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -4,12 +4,24 @@ from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.labeled.containers import TabularDataset -from safeds.exceptions import ModelNotFittedError, DatasetMissesDataError, \ - FeatureDataMismatchError, DatasetMissesTargetError -from safeds.ml.classical.regression import AdaBoostRegressor, DecisionTreeRegressor, ElasticNetRegressor, \ - GradientBoostingRegressor, LassoRegressor, LinearRegressor, RandomForestRegressor, \ - RidgeRegressor, SupportVectorRegressor -from safeds.ml.classical.regression import Regressor +from safeds.exceptions import ( + DatasetMissesDataError, + DatasetMissesTargetError, + FeatureDataMismatchError, + ModelNotFittedError, +) +from safeds.ml.classical.regression import ( + AdaBoostRegressor, + DecisionTreeRegressor, + ElasticNetRegressor, + GradientBoostingRegressor, + LassoRegressor, + LinearRegressor, + RandomForestRegressor, + Regressor, + RidgeRegressor, + SupportVectorRegressor, +) def _fit_single_model(model: Regressor, train_data: TabularDataset) -> Regressor: diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 996e32c0a..9d636718b 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -1,7 +1,11 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesDataError, ColumnTypeError, FeatureDataMismatchError, \ - DatasetMissesTargetError +from safeds.exceptions import ( + ColumnTypeError, + 
DatasetMissesDataError, + DatasetMissesTargetError, + FeatureDataMismatchError, +) from safeds.ml.classical.classification import BaselineClassifier diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 35fefb75e..7eb26d350 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -1,7 +1,11 @@ import pytest from safeds.data.tabular.containers import Table -from safeds.exceptions import DatasetMissesTargetError -from safeds.exceptions import ColumnTypeError, DatasetMissesDataError, FeatureDataMismatchError +from safeds.exceptions import ( + ColumnTypeError, + DatasetMissesDataError, + DatasetMissesTargetError, + FeatureDataMismatchError, +) from safeds.ml.classical.regression import BaselineRegressor @@ -56,7 +60,6 @@ def test_should_raise_if_predict_data_has_differing_features(self) -> None: with pytest.raises(FeatureDataMismatchError): model.predict(predict_data) - def test_should_raise_if_predict_data_misses_target_column(self) -> None: model = BaselineRegressor() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") From 310513b364eb0aaee08840c1a8e90e1a143fff2d Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 1 Jun 2024 23:55:17 +0200 Subject: [PATCH 17/23] fix model tests --- src/safeds/exceptions/_ml.py | 2 +- tests/safeds/ml/nn/test_model.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index 1aabbf1bb..4ab165771 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -82,7 +82,7 @@ class FeatureDataMismatchError(Exception): def __init__(self) -> None: super().__init__( - "The features in the given table do not match with the specified feature columns names of model.", + "The features in the given table do not match with the specified feature columns names of the model.", ) diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 5b8022a2c..04975127f 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -210,7 +210,7 @@ def test_should_raise_if_test_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): model.predict( Table.from_dict({"a": [1], "c": [2]}), @@ -227,7 +227,7 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): learned_model.fit(Table.from_dict({"k": [0.1, 0, 0.2], "l": [0, 0.15, 0.5]}).to_tabular_dataset("k")) @@ -586,7 +586,7 @@ def test_should_raise_if_test_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): model.predict( 
Table.from_dict({"a": [1], "c": [2]}), @@ -603,7 +603,7 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): trained_model.fit( Table.from_dict({"k": [1, 0, 2], "l": [0, 15, 5]}).to_tabular_dataset("l"), From b96fbf388d663376046b7d324fbc1c7fcfc0b908 Mon Sep 17 00:00:00 2001 From: Simon Date: Sun, 2 Jun 2024 13:25:21 +0200 Subject: [PATCH 18/23] fix codecov --- .../ml/classical/classification/_baseline_classifier.py | 6 +++--- src/safeds/ml/classical/regression/_baseline_regressor.py | 6 +++--- .../classical/classification/test_baseline_classifier.py | 8 +++++++- .../ml/classical/regression/test_baseline_regressor.py | 8 +++++++- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index f38043d20..cdd97f5ed 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -21,11 +21,11 @@ def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: - return model.fit(train_data) + return model.fit(train_data) # pragma: no cover def _predict_single_model(model: Classifier, test_data: TabularDataset) -> TabularDataset: - return model.predict(test_data) + return model.predict(test_data) # pragma: no cover class BaselineClassifier: @@ -47,7 +47,7 @@ def __init__(self, extended_search: bool = False): RandomForestClassifier(), ] if extended_search: - self._list_of_model_types.extend([GradientBoostingClassifier()]) + self._list_of_model_types.extend([GradientBoostingClassifier()]) # pragma: no cover self._fitted_models: list[Classifier] = [] self._feature_names: list[str] | None = None diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index f83d0c294..2e7663c32 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -25,11 +25,11 @@ def _fit_single_model(model: Regressor, train_data: TabularDataset) -> Regressor: - return model.fit(train_data) + return model.fit(train_data) # pragma: no cover def _predict_single_model(model: Regressor, test_data: TabularDataset) -> TabularDataset: - return model.predict(test_data) + return model.predict(test_data) # pragma: no cover class BaselineRegressor: @@ -54,7 +54,7 @@ def __init__(self, include_slower_models: bool = False): ] if include_slower_models: - self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) + self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) # pragma: no cover self._fitted_models: list[Regressor] = [] self._feature_names: list[str] | None = None diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 9d636718b..ebfe38102 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -4,7 +4,7 @@ ColumnTypeError, DatasetMissesDataError, 
DatasetMissesTargetError, - FeatureDataMismatchError, + FeatureDataMismatchError, ModelNotFittedError, ) from safeds.ml.classical.classification import BaselineClassifier @@ -52,6 +52,12 @@ def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: model = model.fit(data) assert model.is_fitted + def test_should_raise_if_model_not_fitted(self) -> None: + model = BaselineClassifier() + predict_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ModelNotFittedError): + model.predict(predict_data) + def test_should_raise_if_predict_data_has_differing_features(self) -> None: model = BaselineClassifier() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 7eb26d350..27e6f5007 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -4,7 +4,7 @@ ColumnTypeError, DatasetMissesDataError, DatasetMissesTargetError, - FeatureDataMismatchError, + FeatureDataMismatchError, ModelNotFittedError, ) from safeds.ml.classical.regression import BaselineRegressor @@ -52,6 +52,12 @@ def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: model = model.fit(data) assert model.is_fitted + def test_should_raise_if_model_not_fitted(self) -> None: + model = BaselineRegressor() + predict_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ModelNotFittedError): + model.predict(predict_data) + def test_should_raise_if_predict_data_has_differing_features(self) -> None: model = BaselineRegressor() fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") From 8a15d456b5086c1a214bb472af1bf589c8c9e244 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Sun, 2 Jun 2024 11:27:00 +0000 Subject: [PATCH 19/23] style: apply automated linter fixes --- src/safeds/ml/classical/regression/_baseline_regressor.py | 4 +++- .../ml/classical/classification/test_baseline_classifier.py | 3 ++- .../safeds/ml/classical/regression/test_baseline_regressor.py | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 2e7663c32..1d3da7c51 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -54,7 +54,9 @@ def __init__(self, include_slower_models: bool = False): ] if include_slower_models: - self._list_of_model_types.extend([ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()]) # pragma: no cover + self._list_of_model_types.extend( + [ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()], + ) # pragma: no cover self._fitted_models: list[Regressor] = [] self._feature_names: list[str] | None = None diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index ebfe38102..7d586f0b6 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -4,7 +4,8 @@ ColumnTypeError, DatasetMissesDataError, DatasetMissesTargetError, - FeatureDataMismatchError, 
ModelNotFittedError, + FeatureDataMismatchError, + ModelNotFittedError, ) from safeds.ml.classical.classification import BaselineClassifier diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 27e6f5007..05a5b3233 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -4,7 +4,8 @@ ColumnTypeError, DatasetMissesDataError, DatasetMissesTargetError, - FeatureDataMismatchError, ModelNotFittedError, + FeatureDataMismatchError, + ModelNotFittedError, ) from safeds.ml.classical.regression import BaselineRegressor From af06e733a677845a3a53217dbbd49d75f2a78cb9 Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 3 Jun 2024 16:40:32 +0200 Subject: [PATCH 20/23] rename DatasetMissesTargetError to TargetDataMismatchError --- src/safeds/exceptions/__init__.py | 4 ++-- src/safeds/exceptions/_ml.py | 14 +++++++++----- .../classification/_baseline_classifier.py | 8 ++++---- .../ml/classical/regression/_baseline_regressor.py | 8 ++++---- .../classification/test_baseline_classifier.py | 4 ++-- .../regression/test_baseline_regressor.py | 4 ++-- 6 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 65ee819d8..4c2241941 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -16,7 +16,7 @@ from ._ml import ( DatasetMissesDataError, DatasetMissesFeaturesError, - DatasetMissesTargetError, + TargetDataMismatchError, FeatureDataMismatchError, InputSizeError, InvalidModelStructureError, @@ -69,7 +69,7 @@ class OutOfBoundsError(SafeDsError): # ML exceptions "DatasetMissesDataError", "DatasetMissesFeaturesError", - "DatasetMissesTargetError", + "TargetDataMismatchError", "FeatureDataMismatchError", "InputSizeError", "InvalidModelStructureError", diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index e92863d1f..f95360455 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -15,18 +15,22 @@ def __init__(self, missing_feature_names: list[str]): super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.") -class DatasetMissesTargetError(ValueError): +class TargetDataMismatchError(ValueError): """ - Raised when a dataset misses the target column. + Raised when the target column of a test dataset mismatches with the target column of the training dataset. + + Currently only used in the Baseline Models. Parameters ---------- + actual_target_name: + The actual target column of the dataset. missing_target_name: - The names of the missing target column. + The name of the missing target column. 
""" - def __init__(self, missing_target_name: str): - super().__init__(f"Dataset misses the target column '{missing_target_name}'.") + def __init__(self, actual_target_name: str, missing_target_name: str): + super().__init__(f"The provided target column '{actual_target_name}' does not match the target column of the training set '{missing_target_name}'.") class DatasetMissesDataError(ValueError): diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index cdd97f5ed..907b69b9d 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -6,7 +6,7 @@ from safeds.data.labeled.containers import TabularDataset from safeds.exceptions import ( DatasetMissesDataError, - DatasetMissesTargetError, + TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, ) @@ -124,8 +124,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: If the features of the test data do not match with the features of the trained Classifier. DatasetMissesDataError If the given test_data contains no data. - DatasetMissesTargetError - If the given test_data misses the target column. + TargetDataMismatchError + If the target column of the test data does not match the target column of the training data. ColumnTypeError If one or more columns contain non-numeric values. """ @@ -140,7 +140,7 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: - raise DatasetMissesTargetError(self._target_name) + raise TargetDataMismatchError(actual_target_name=test_data.target.name, missing_target_name=self._target_name) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 1d3da7c51..29e91ff7a 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -6,7 +6,7 @@ from safeds.data.labeled.containers import TabularDataset from safeds.exceptions import ( DatasetMissesDataError, - DatasetMissesTargetError, + TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, ) @@ -133,8 +133,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: If the features of the test data do not match with the features of the trained Regressor. DatasetMissesDataError If the given test_data contains no data. - DatasetMissesTargetError - If the given test_data misses the target column. + TargetDataMismatchError + If the target column of the test data does not match the target column of the training data. ColumnTypeError If one or more columns contain non-numeric values. 
""" @@ -149,7 +149,7 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: - raise DatasetMissesTargetError(self._target_name) + raise TargetDataMismatchError(actual_target_name=test_data.target.name, missing_target_name=self._target_name) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index 7d586f0b6..b1d4d014e 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -3,7 +3,7 @@ from safeds.exceptions import ( ColumnTypeError, DatasetMissesDataError, - DatasetMissesTargetError, + TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, ) @@ -72,7 +72,7 @@ def test_should_raise_if_predict_data_misses_target_column(self) -> None: fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") model = model.fit(fit_data) - with pytest.raises(DatasetMissesTargetError): + with pytest.raises(TargetDataMismatchError): model.predict(predict_data) def test_check_predict_return_type_and_values(self) -> None: diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 05a5b3233..3b529e961 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -3,7 +3,7 @@ from safeds.exceptions import ( ColumnTypeError, DatasetMissesDataError, - DatasetMissesTargetError, + TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, ) @@ -72,7 +72,7 @@ def test_should_raise_if_predict_data_misses_target_column(self) -> None: fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") model = model.fit(fit_data) - with pytest.raises(DatasetMissesTargetError): + with pytest.raises(TargetDataMismatchError): model.predict(predict_data) def test_check_predict_return_type_and_values(self) -> None: From faab0a864d5230bcb8d076e2bc5c728381fe1023 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:42:21 +0000 Subject: [PATCH 21/23] style: apply automated linter fixes --- src/safeds/exceptions/__init__.py | 2 +- src/safeds/exceptions/_ml.py | 4 +++- .../ml/classical/classification/_baseline_classifier.py | 6 ++++-- src/safeds/ml/classical/regression/_baseline_regressor.py | 6 ++++-- .../ml/classical/classification/test_baseline_classifier.py | 2 +- .../ml/classical/regression/test_baseline_regressor.py | 2 +- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 4c2241941..5cb3f6eaa 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -16,7 +16,6 @@ from ._ml import ( DatasetMissesDataError, DatasetMissesFeaturesError, - TargetDataMismatchError, FeatureDataMismatchError, InputSizeError, InvalidModelStructureError, @@ -24,6 +23,7 @@ ModelNotFittedError, 
PlainTableError, PredictionError, + TargetDataMismatchError, ) diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index f95360455..d07ae27ed 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -30,7 +30,9 @@ class TargetDataMismatchError(ValueError): """ def __init__(self, actual_target_name: str, missing_target_name: str): - super().__init__(f"The provided target column '{actual_target_name}' does not match the target column of the training set '{missing_target_name}'.") + super().__init__( + f"The provided target column '{actual_target_name}' does not match the target column of the training set '{missing_target_name}'.", + ) class DatasetMissesDataError(ValueError): diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 907b69b9d..4dbdecf80 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -6,9 +6,9 @@ from safeds.data.labeled.containers import TabularDataset from safeds.exceptions import ( DatasetMissesDataError, - TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, + TargetDataMismatchError, ) from safeds.ml.classical.classification import ( AdaBoostClassifier, @@ -140,7 +140,9 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: - raise TargetDataMismatchError(actual_target_name=test_data.target.name, missing_target_name=self._target_name) + raise TargetDataMismatchError( + actual_target_name=test_data.target.name, missing_target_name=self._target_name, + ) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 29e91ff7a..89511b3ee 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -6,9 +6,9 @@ from safeds.data.labeled.containers import TabularDataset from safeds.exceptions import ( DatasetMissesDataError, - TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, + TargetDataMismatchError, ) from safeds.ml.classical.regression import ( AdaBoostRegressor, @@ -149,7 +149,9 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: if not self._feature_names == test_data.features.column_names: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: - raise TargetDataMismatchError(actual_target_name=test_data.target.name, missing_target_name=self._target_name) + raise TargetDataMismatchError( + actual_target_name=test_data.target.name, missing_target_name=self._target_name, + ) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: raise DatasetMissesDataError diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index b1d4d014e..f8c4f969c 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -3,9 +3,9 @@ from safeds.exceptions import ( ColumnTypeError, DatasetMissesDataError, - TargetDataMismatchError, 
FeatureDataMismatchError, ModelNotFittedError, + TargetDataMismatchError, ) from safeds.ml.classical.classification import BaselineClassifier diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 3b529e961..174f4c160 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -3,9 +3,9 @@ from safeds.exceptions import ( ColumnTypeError, DatasetMissesDataError, - TargetDataMismatchError, FeatureDataMismatchError, ModelNotFittedError, + TargetDataMismatchError, ) from safeds.ml.classical.regression import BaselineRegressor From abab90b3360a3233b0c179ff321d4f5b0c476a33 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:44:01 +0000 Subject: [PATCH 22/23] style: apply automated linter fixes --- src/safeds/ml/classical/classification/_baseline_classifier.py | 3 ++- src/safeds/ml/classical/regression/_baseline_regressor.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py index 4dbdecf80..7b58d61e2 100644 --- a/src/safeds/ml/classical/classification/_baseline_classifier.py +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -141,7 +141,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: raise TargetDataMismatchError( - actual_target_name=test_data.target.name, missing_target_name=self._target_name, + actual_target_name=test_data.target.name, + missing_target_name=self._target_name, ) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py index 89511b3ee..4562ed122 100644 --- a/src/safeds/ml/classical/regression/_baseline_regressor.py +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -150,7 +150,8 @@ def predict(self, test_data: TabularDataset) -> dict[str, float]: raise FeatureDataMismatchError if not self._target_name == test_data.target.name: raise TargetDataMismatchError( - actual_target_name=test_data.target.name, missing_target_name=self._target_name, + actual_target_name=test_data.target.name, + missing_target_name=self._target_name, ) test_data_as_table = test_data.to_table() if test_data_as_table.row_count == 0: From b8229f646b2d6daf8734a925eebad24fa19de465 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 25 Jun 2024 22:30:40 +0200 Subject: [PATCH 23/23] remove todos --- .../ml/classical/classification/test_baseline_classifier.py | 2 -- tests/safeds/ml/classical/regression/test_baseline_regressor.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py index b1d4d014e..1af06a0b2 100644 --- a/tests/safeds/ml/classical/classification/test_baseline_classifier.py +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -10,8 +10,6 @@ from safeds.ml.classical.classification import BaselineClassifier -# TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. 
Find a way to -# TODO only fit a model once and pass it to all predict test cases. class TestBaselineClassifier: def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: model = BaselineClassifier() diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py index 3b529e961..57bc4873c 100644 --- a/tests/safeds/ml/classical/regression/test_baseline_regressor.py +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -10,8 +10,6 @@ from safeds.ml.classical.regression import BaselineRegressor -# TODO To test predict cases, we have to fit the model first which takes a couple seconds each time. Find a way to -# TODO only fit a model once and pass it to all predict test cases. class TestBaselineRegressor: def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: model = BaselineRegressor()
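
Putting the pieces of this series together, here is a minimal usage sketch. It assumes only the public API visible in the patches above (Table, to_tabular_dataset, BaselineClassifier.fit/predict, is_fitted, and the renamed TargetDataMismatchError); the exact metric names in the dictionary returned by predict() are whatever the fitted baseline reports, so the sketch simply prints the whole dictionary.

    from safeds.data.tabular.containers import Table
    from safeds.exceptions import TargetDataMismatchError
    from safeds.ml.classical.classification import BaselineClassifier

    # Small toy dataset; "target" is the column the baseline models should predict.
    train = Table({"feat": [0, 1, 0, 1], "target": [0, 1, 0, 1]}).to_tabular_dataset("target")

    # fit() returns a fitted model; BaselineClassifier trains several classical
    # classifiers internally and predict() is only usable once is_fitted is True.
    model = BaselineClassifier().fit(train)
    assert model.is_fitted

    # predict() evaluates the fitted models on the given dataset and returns a
    # dictionary of metric scores (the best values found across the fitted models).
    print(model.predict(train))

    # Predicting on a dataset whose target column name differs from the training
    # target raises the renamed TargetDataMismatchError (patch 20).
    mismatched = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other")
    try:
        model.predict(mismatched)
    except TargetDataMismatchError as error:
        print(error)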