From 66f5f647820b3bf36a6041f04d6e8547170b1c81 Mon Sep 17 00:00:00 2001 From: Alex Senger <91055000+alex-senger@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:50:16 +0200 Subject: [PATCH] feat: Improve Error Handling of classifiers and regressors (#355) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #153. ### Summary of Changes The error handling of classifiers and regressors was not ideal. Now every classifier and regressor validates its data before fitting or predicting and raises a dedicated error when the table contains non-numerical values, missing values, or no rows. --------- Co-authored-by: Alexander Gréus Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Co-authored-by: Severin Paul Höfer <84280965+zzril@users.noreply.github.com> Co-authored-by: Alexander <47296670+Marsmaennchen221@users.noreply.github.com> Co-authored-by: Junior Atemebang <129027012+jxnior01@users.noreply.github.com> --- src/safeds/exceptions/__init__.py | 4 + src/safeds/exceptions/_data.py | 23 +++- src/safeds/exceptions/_ml.py | 7 + src/safeds/ml/classical/_util_sklearn.py | 72 ++++++++++ .../ml/classical/classification/_ada_boost.py | 14 ++ .../classification/_decision_tree.py | 14 ++ .../classification/_gradient_boosting.py | 14 ++ .../classification/_k_nearest_neighbors.py | 17 +++ .../classification/_logistic_regression.py | 14 ++ .../classification/_random_forest.py | 14 ++ .../classification/_support_vector_machine.py | 14 ++ .../ml/classical/regression/_ada_boost.py | 14 ++ .../ml/classical/regression/_decision_tree.py | 14 ++ .../regression/_elastic_net_regression.py | 14 ++ .../regression/_gradient_boosting.py | 14 ++ .../regression/_k_nearest_neighbors.py | 17 +++ .../classical/regression/_lasso_regression.py | 14 ++ .../regression/_linear_regression.py | 14 ++ .../ml/classical/regression/_random_forest.py | 14 ++ .../classical/regression/_ridge_regression.py | 14 ++ .../regression/_support_vector_machine.py | 14 ++ 
.../containers/_table/test_plot_boxplots.py | 2 +- .../classification/test_classifier.py | 130 +++++++++++++++--- .../ml/classical/regression/test_regressor.py | 130 +++++++++++++++--- .../safeds/ml/classical/test_util_sklearn.py | 37 +++++ 25 files changed, 604 insertions(+), 45 deletions(-) diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 48af59135..05bf9dcf8 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -5,6 +5,7 @@ ColumnSizeError, DuplicateColumnNameError, IndexOutOfBoundsError, + MissingValuesColumnError, NonNumericColumnError, SchemaMismatchError, TransformerNotFittedError, @@ -14,6 +15,7 @@ ) from safeds.exceptions._ml import ( DatasetContainsTargetError, + DatasetMissesDataError, DatasetMissesFeaturesError, LearningError, ModelNotFittedError, @@ -33,6 +35,7 @@ "UnknownColumnNameError", "ValueNotPresentWhenFittedError", "WrongFileExtensionError", + "MissingValuesColumnError", # ML exceptions "DatasetContainsTargetError", "DatasetMissesFeaturesError", @@ -40,4 +43,5 @@ "ModelNotFittedError", "PredictionError", "UntaggedTableError", + "DatasetMissesDataError", ] diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py index 26e402c1c..2278efbfd 100644 --- a/src/safeds/exceptions/_data.py +++ b/src/safeds/exceptions/_data.py @@ -23,8 +23,27 @@ def __init__(self, column_names: list[str]): class NonNumericColumnError(Exception): """Exception raised for trying to do numerical operations on a non-numerical column.""" - def __init__(self, column_info: str) -> None: - super().__init__(f"Tried to do a numerical operation on one or multiple non numerical Columns: \n{column_info}") + def __init__(self, column_info: str, help_msg: str | None = None) -> None: + line_break = "\n" + super().__init__( + ( + "Tried to do a numerical operation on one or multiple non-numerical columns:" + f" \n{column_info}{line_break + help_msg if help_msg is not None else ''}" + ), + ) + + 
+class MissingValuesColumnError(Exception): + """Exception raised for trying to do operations on columns containing missing values.""" + + def __init__(self, column_info: str, help_msg: str | None = None) -> None: + line_break = "\n" + super().__init__( + ( + "Tried to do an operation on one or multiple columns containing missing values:" + f" \n{column_info}{line_break + help_msg if help_msg is not None else ''}" + ), + ) class DuplicateColumnNameError(Exception): diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index bf82d7b9c..1cc83ebc0 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -26,6 +26,13 @@ def __init__(self, missing_feature_names: list[str]): super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.") +class DatasetMissesDataError(ValueError): + """Raised when a dataset contains no rows.""" + + def __init__(self) -> None: + super().__init__("Dataset contains no rows") + + class LearningError(Exception): """ Raised when an error occurred while training a model. diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 6b581c3d8..19c13e5b3 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -4,9 +4,12 @@ from safeds.data.tabular.containers import Table, TaggedTable from safeds.exceptions import ( DatasetContainsTargetError, + DatasetMissesDataError, DatasetMissesFeaturesError, LearningError, + MissingValuesColumnError, ModelNotFittedError, + NonNumericColumnError, PredictionError, UntaggedTableError, ) @@ -30,9 +33,44 @@ def fit(model: Any, tagged_table: TaggedTable) -> None: If the tagged table contains invalid values or if the training failed. UntaggedTableError If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. 
+ DatasetMissesDataError + If the training data contains no rows. """ if not isinstance(tagged_table, TaggedTable) and isinstance(tagged_table, Table): raise UntaggedTableError + + if tagged_table.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names = set(tagged_table.features.column_names) - set( + tagged_table.features.remove_columns_with_non_numerical_values().column_names, + ) + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + ( + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder." + ), + ) + + null_containing_column_names = set(tagged_table.features.column_names) - set( + tagged_table.features.remove_columns_with_missing_values().column_names, + ) + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + ( + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`." + ), + ) + try: model.fit( tagged_table.features._data, @@ -73,6 +111,12 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_ If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" # Validation if model is None or target_name is None or feature_names is None: @@ -83,6 +127,34 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_ if missing_feature_names: raise DatasetMissesFeaturesError(missing_feature_names) + if dataset.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, + ) + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + ( + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n" + ), + ) + + null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, + ) + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + ( + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`." 
+ ), + ) + dataset_df = dataset.keep_only_columns(feature_names)._data dataset_df.columns = feature_names diff --git a/src/safeds/ml/classical/classification/_ada_boost.py b/src/safeds/ml/classical/classification/_ada_boost.py index 18b34d19d..86ffde895 100644 --- a/src/safeds/ml/classical/classification/_ada_boost.py +++ b/src/safeds/ml/classical/classification/_ada_boost.py @@ -90,6 +90,14 @@ def fit(self, training_set: TaggedTable) -> AdaBoost: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -129,6 +137,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_decision_tree.py b/src/safeds/ml/classical/classification/_decision_tree.py index fd5f24a4b..5b183e8b4 100644 --- a/src/safeds/ml/classical/classification/_decision_tree.py +++ b/src/safeds/ml/classical/classification/_decision_tree.py @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> DecisionTree: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. 
+ MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_gradient_boosting.py b/src/safeds/ml/classical/classification/_gradient_boosting.py index c94a95f7d..17b3a4205 100644 --- a/src/safeds/ml/classical/classification/_gradient_boosting.py +++ b/src/safeds/ml/classical/classification/_gradient_boosting.py @@ -77,6 +77,14 @@ def fit(self, training_set: TaggedTable) -> GradientBoosting: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -112,6 +120,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py index df9bb7e79..a7eaa4ff0 100644 --- a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py @@ -4,6 +4,7 @@ from sklearn.neighbors import KNeighborsClassifier as sk_KNeighborsClassifier +from safeds.exceptions import DatasetMissesDataError from safeds.ml.classical._util_sklearn import fit, predict from ._classifier import Classifier @@ -69,7 +70,17 @@ def fit(self, training_set: TaggedTable) -> KNearestNeighbors: If `number_of_neighbors` is greater than the sample size. LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ + if training_set.number_of_rows == 0: + raise DatasetMissesDataError if self._number_of_neighbors > training_set.number_of_rows: raise ValueError( ( @@ -111,6 +122,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_logistic_regression.py b/src/safeds/ml/classical/classification/_logistic_regression.py index 864aa91d4..b2a5cbeba 100644 --- a/src/safeds/ml/classical/classification/_logistic_regression.py +++ b/src/safeds/ml/classical/classification/_logistic_regression.py @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> LogisticRegression: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_random_forest.py b/src/safeds/ml/classical/classification/_random_forest.py index 5f0bdf26c..c50b063e0 100644 --- a/src/safeds/ml/classical/classification/_random_forest.py +++ b/src/safeds/ml/classical/classification/_random_forest.py @@ -65,6 +65,14 @@ def fit(self, training_set: TaggedTable) -> RandomForest: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. 
+ MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -100,6 +108,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/classification/_support_vector_machine.py b/src/safeds/ml/classical/classification/_support_vector_machine.py index 5af9ee825..11c5765d8 100644 --- a/src/safeds/ml/classical/classification/_support_vector_machine.py +++ b/src/safeds/ml/classical/classification/_support_vector_machine.py @@ -119,6 +119,14 @@ def fit(self, training_set: TaggedTable) -> SupportVectorMachine: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_classifier = self._get_sklearn_classifier() fit(wrapped_classifier, training_set) @@ -154,6 +162,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_ada_boost.py b/src/safeds/ml/classical/regression/_ada_boost.py index e3ba73ca2..9e4254292 100644 --- a/src/safeds/ml/classical/regression/_ada_boost.py +++ b/src/safeds/ml/classical/regression/_ada_boost.py @@ -90,6 +90,14 @@ def fit(self, training_set: TaggedTable) -> AdaBoost: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -129,6 +137,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_decision_tree.py b/src/safeds/ml/classical/regression/_decision_tree.py index be354f695..91beb47fb 100644 --- a/src/safeds/ml/classical/regression/_decision_tree.py +++ b/src/safeds/ml/classical/regression/_decision_tree.py @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> DecisionTree: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. 
+ DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_elastic_net_regression.py b/src/safeds/ml/classical/regression/_elastic_net_regression.py index e615ac870..bb8951814 100644 --- a/src/safeds/ml/classical/regression/_elastic_net_regression.py +++ b/src/safeds/ml/classical/regression/_elastic_net_regression.py @@ -102,6 +102,14 @@ def fit(self, training_set: TaggedTable) -> ElasticNetRegression: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -137,6 +145,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_gradient_boosting.py b/src/safeds/ml/classical/regression/_gradient_boosting.py index d8cb47fa3..c851f948f 100644 --- a/src/safeds/ml/classical/regression/_gradient_boosting.py +++ b/src/safeds/ml/classical/regression/_gradient_boosting.py @@ -77,6 +77,14 @@ def fit(self, training_set: TaggedTable) -> GradientBoosting: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -112,6 +120,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py index da1e4d20d..f99b4d66e 100644 --- a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py @@ -4,6 +4,7 @@ from sklearn.neighbors import KNeighborsRegressor as sk_KNeighborsRegressor +from safeds.exceptions import DatasetMissesDataError from safeds.ml.classical._util_sklearn import fit, predict from ._regressor import Regressor @@ -69,7 +70,17 @@ def fit(self, training_set: TaggedTable) -> KNearestNeighbors: If `number_of_neighbors` is greater than the sample size. LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ + if training_set.number_of_rows == 0: + raise DatasetMissesDataError if self._number_of_neighbors > training_set.number_of_rows: raise ValueError( ( @@ -112,6 +123,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_lasso_regression.py b/src/safeds/ml/classical/regression/_lasso_regression.py index bc175e5f9..857db67a5 100644 --- a/src/safeds/ml/classical/regression/_lasso_regression.py +++ b/src/safeds/ml/classical/regression/_lasso_regression.py @@ -75,6 +75,14 @@ def fit(self, training_set: TaggedTable) -> LassoRegression: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -110,6 +118,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py index 4c60da473..1c60a7114 100644 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ b/src/safeds/ml/classical/regression/_linear_regression.py @@ -43,6 +43,14 @@ def fit(self, training_set: TaggedTable) -> LinearRegression: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. 
+ MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -78,6 +86,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_random_forest.py b/src/safeds/ml/classical/regression/_random_forest.py index b3c6f878b..08d8c9883 100644 --- a/src/safeds/ml/classical/regression/_random_forest.py +++ b/src/safeds/ml/classical/regression/_random_forest.py @@ -65,6 +65,14 @@ def fit(self, training_set: TaggedTable) -> RandomForest: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -100,6 +108,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. 
""" return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_ridge_regression.py b/src/safeds/ml/classical/regression/_ridge_regression.py index 002bf26c7..c57b77b43 100644 --- a/src/safeds/ml/classical/regression/_ridge_regression.py +++ b/src/safeds/ml/classical/regression/_ridge_regression.py @@ -76,6 +76,14 @@ def fit(self, training_set: TaggedTable) -> RidgeRegression: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. + MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -111,6 +119,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/src/safeds/ml/classical/regression/_support_vector_machine.py b/src/safeds/ml/classical/regression/_support_vector_machine.py index d22858261..d43da828d 100644 --- a/src/safeds/ml/classical/regression/_support_vector_machine.py +++ b/src/safeds/ml/classical/regression/_support_vector_machine.py @@ -119,6 +119,14 @@ def fit(self, training_set: TaggedTable) -> SupportVectorMachine: ------ LearningError If the training data contains invalid values or if the training failed. + UntaggedTableError + If the table is untagged. + NonNumericColumnError + If the training data contains non-numerical values. 
+ MissingValuesColumnError + If the training data contains missing values. + DatasetMissesDataError + If the training data contains no rows. """ wrapped_regressor = self._get_sklearn_regressor() fit(wrapped_regressor, training_set) @@ -154,6 +162,12 @@ def predict(self, dataset: Table) -> TaggedTable: If the dataset misses feature columns. PredictionError If predicting with the given dataset failed. + NonNumericColumnError + If the dataset contains non-numerical values. + MissingValuesColumnError + If the dataset contains missing values. + DatasetMissesDataError + If the dataset contains no rows. """ return predict(self._wrapped_regressor, dataset, self._feature_names, self._target_name) diff --git a/tests/safeds/data/tabular/containers/_table/test_plot_boxplots.py b/tests/safeds/data/tabular/containers/_table/test_plot_boxplots.py index 80970da40..d8f615ef7 100644 --- a/tests/safeds/data/tabular/containers/_table/test_plot_boxplots.py +++ b/tests/safeds/data/tabular/containers/_table/test_plot_boxplots.py @@ -35,7 +35,7 @@ def test_should_raise_if_column_contains_non_numerical_values() -> None: with pytest.raises( NonNumericColumnError, match=( - r"Tried to do a numerical operation on one or multiple non numerical Columns: \nThis table contains only" + r"Tried to do a numerical operation on one or multiple non-numerical columns: \nThis table contains only" r" non-numerical columns." 
), ): diff --git a/tests/safeds/ml/classical/classification/test_classifier.py b/tests/safeds/ml/classical/classification/test_classifier.py index c198c46ac..311cc5292 100644 --- a/tests/safeds/ml/classical/classification/test_classifier.py +++ b/tests/safeds/ml/classical/classification/test_classifier.py @@ -1,15 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pytest from safeds.data.tabular.containers import Table, TaggedTable from safeds.exceptions import ( DatasetContainsTargetError, + DatasetMissesDataError, DatasetMissesFeaturesError, - LearningError, + MissingValuesColumnError, ModelNotFittedError, - PredictionError, + NonNumericColumnError, UntaggedTableError, ) from safeds.ml.classical.classification import ( @@ -63,18 +64,6 @@ def valid_data() -> TaggedTable: ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]) -@pytest.fixture() -def invalid_data() -> TaggedTable: - return Table( - { - "id": [1, 4], - "feat1": ["a", 5], - "feat2": [3, 6], - "target": [0, 1], - }, - ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]) - - @pytest.mark.parametrize("classifier", classifiers(), ids=lambda x: x.__class__.__name__) class TestFit: def test_should_succeed_on_valid_data(self, classifier: Classifier, valid_data: TaggedTable) -> None: @@ -91,8 +80,66 @@ def test_should_not_change_input_table(self, classifier: Classifier, request: Fi classifier.fit(valid_data) assert valid_data == valid_data_copy - def test_should_raise_on_invalid_data(self, classifier: Classifier, invalid_data: TaggedTable) -> None: - with pytest.raises(LearningError): + @pytest.mark.parametrize( + ("invalid_data", "expected_error", "expected_error_msg"), + [ + ( + Table( + { + "id": [1, 4], + "feat1": ["a", 5], + "feat2": [3, 6], + "target": [0, 1], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + NonNumericColumnError, + ( + r"Tried to do a numerical 
operation on one or multiple non-numerical columns: \n\{'feat1'\}\nYou" + r" can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + r" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too" + r" many different values\nor is ordinal, you should use the LabelEncoder." + ), + ), + ( + Table( + { + "id": [1, 4], + "feat1": [None, 5], + "feat2": [3, 6], + "target": [0, 1], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + MissingValuesColumnError, + ( + r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}\nYou" + r" can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + r" remove the missing values entirely you can use the method" + r" `Table.remove_rows_with_missing_values`." + ), + ), + ( + Table( + { + "id": [], + "feat1": [], + "feat2": [], + "target": [], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + DatasetMissesDataError, + r"Dataset contains no rows", + ), + ], + ids=["non-numerical data", "missing values in data", "no rows in data"], + ) + def test_should_raise_on_invalid_data( + self, + classifier: Classifier, + invalid_data: TaggedTable, + expected_error: Any, + expected_error_msg: str, + ) -> None: + with pytest.raises(expected_error, match=expected_error_msg): classifier.fit(invalid_data) @pytest.mark.parametrize( @@ -151,15 +198,56 @@ def test_should_raise_if_dataset_misses_features(self, classifier: Classifier, v with pytest.raises(DatasetMissesFeaturesError, match="[feat1, feat2]"): fitted_classifier.predict(valid_data.remove_columns(["feat1", "feat2", "target"])) + @pytest.mark.parametrize( + ("invalid_data", "expected_error", "expected_error_msg"), + [ + ( + Table( + { + "id": [1, 4], + "feat1": ["a", 5], + "feat2": [3, 6], + }, + ), + NonNumericColumnError, + r"Tried to do a numerical operation on one or multiple 
non-numerical columns: \n\{'feat1'\}", + ), + ( + Table( + { + "id": [1, 4], + "feat1": [None, 5], + "feat2": [3, 6], + }, + ), + MissingValuesColumnError, + r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}", + ), + ( + Table( + { + "id": [], + "feat1": [], + "feat2": [], + }, + ), + DatasetMissesDataError, + r"Dataset contains no rows", + ), + ], + ids=["non-numerical data", "missing values in data", "no rows in data"], + ) def test_should_raise_on_invalid_data( self, classifier: Classifier, valid_data: TaggedTable, - invalid_data: TaggedTable, + invalid_data: Table, + expected_error: Any, + expected_error_msg: str, ) -> None: - fitted_classifier = classifier.fit(valid_data) - with pytest.raises(PredictionError): - fitted_classifier.predict(invalid_data.features) + classifier = classifier.fit(valid_data) + with pytest.raises(expected_error, match=expected_error_msg): + classifier.predict(invalid_data) @pytest.mark.parametrize("classifier", classifiers(), ids=lambda x: x.__class__.__name__) diff --git a/tests/safeds/ml/classical/regression/test_regressor.py b/tests/safeds/ml/classical/regression/test_regressor.py index 21aa4816d..7179d6781 100644 --- a/tests/safeds/ml/classical/regression/test_regressor.py +++ b/tests/safeds/ml/classical/regression/test_regressor.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pandas as pd import pytest @@ -8,10 +8,11 @@ from safeds.exceptions import ( ColumnLengthMismatchError, DatasetContainsTargetError, + DatasetMissesDataError, DatasetMissesFeaturesError, - LearningError, + MissingValuesColumnError, ModelNotFittedError, - PredictionError, + NonNumericColumnError, UntaggedTableError, ) from safeds.ml.classical.regression import ( @@ -74,18 +75,6 @@ def valid_data() -> TaggedTable: ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]) -@pytest.fixture() -def invalid_data() -> 
TaggedTable: - return Table( - { - "id": [1, 4], - "feat1": ["a", 5], - "feat2": [3, 6], - "target": [0, 1], - }, - ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]) - - @pytest.mark.parametrize("regressor", regressors(), ids=lambda x: x.__class__.__name__) class TestFit: def test_should_succeed_on_valid_data(self, regressor: Regressor, valid_data: TaggedTable) -> None: @@ -102,8 +91,56 @@ def test_should_not_change_input_table(self, regressor: Regressor, request: Fixt regressor.fit(valid_data) assert valid_data == valid_data_copy - def test_should_raise_on_invalid_data(self, regressor: Regressor, invalid_data: TaggedTable) -> None: - with pytest.raises(LearningError): + @pytest.mark.parametrize( + ("invalid_data", "expected_error", "expected_error_msg"), + [ + ( + Table( + { + "id": [1, 4], + "feat1": ["a", 5], + "feat2": [3, 6], + "target": [0, 1], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + NonNumericColumnError, + r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\{'feat1'\}", + ), + ( + Table( + { + "id": [1, 4], + "feat1": [None, 5], + "feat2": [3, 6], + "target": [0, 1], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + MissingValuesColumnError, + r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}", + ), + ( + Table( + { + "id": [], + "feat1": [], + "feat2": [], + "target": [], + }, + ).tag_columns(target_name="target", feature_names=["feat1", "feat2"]), + DatasetMissesDataError, + r"Dataset contains no rows", + ), + ], + ids=["non-numerical data", "missing values in data", "no rows in data"], + ) + def test_should_raise_on_invalid_data( + self, + regressor: Regressor, + invalid_data: TaggedTable, + expected_error: Any, + expected_error_msg: str, + ) -> None: + with pytest.raises(expected_error, match=expected_error_msg): regressor.fit(invalid_data) @pytest.mark.parametrize( @@ -162,15 +199,66 @@ def 
test_should_raise_if_dataset_misses_features(self, regressor: Regressor, val with pytest.raises(DatasetMissesFeaturesError, match="[feat1, feat2]"): fitted_regressor.predict(valid_data.remove_columns(["feat1", "feat2", "target"])) + @pytest.mark.parametrize( + ("invalid_data", "expected_error", "expected_error_msg"), + [ + ( + Table( + { + "id": [1, 4], + "feat1": ["a", 5], + "feat2": [3, 6], + }, + ), + NonNumericColumnError, + ( + r"Tried to do a numerical operation on one or multiple non-numerical columns: \n\{'feat1'\}\nYou" + r" can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + r" data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too" + r" many different values\nor is ordinal, you should use the LabelEncoder." + ), + ), + ( + Table( + { + "id": [1, 4], + "feat1": [None, 5], + "feat2": [3, 6], + }, + ), + MissingValuesColumnError, + ( + r"Tried to do an operation on one or multiple columns containing missing values: \n\{'feat1'\}\nYou" + r" can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + r" remove the missing values entirely you can use the method" + r" `Table.remove_rows_with_missing_values`." 
+ ), + ), + ( + Table( + { + "id": [], + "feat1": [], + "feat2": [], + }, + ), + DatasetMissesDataError, + r"Dataset contains no rows", + ), + ], + ids=["non-numerical data", "missing values in data", "no rows in data"], + ) def test_should_raise_on_invalid_data( self, regressor: Regressor, valid_data: TaggedTable, - invalid_data: TaggedTable, + invalid_data: Table, + expected_error: Any, + expected_error_msg: str, ) -> None: - fitted_regressor = regressor.fit(valid_data) - with pytest.raises(PredictionError): - fitted_regressor.predict(invalid_data.features) + regressor = regressor.fit(valid_data) + with pytest.raises(expected_error, match=expected_error_msg): + regressor.predict(invalid_data) @pytest.mark.parametrize("regressor", regressors(), ids=lambda x: x.__class__.__name__) diff --git a/tests/safeds/ml/classical/test_util_sklearn.py b/tests/safeds/ml/classical/test_util_sklearn.py index 46b88a9f2..741b53d7f 100644 --- a/tests/safeds/ml/classical/test_util_sklearn.py +++ b/tests/safeds/ml/classical/test_util_sklearn.py @@ -1,6 +1,10 @@ import warnings +from typing import Any +import pytest from safeds.data.tabular.containers import Table +from safeds.exceptions import LearningError, PredictionError +from safeds.ml.classical._util_sklearn import fit, predict from safeds.ml.classical.regression import LinearRegression @@ -17,3 +21,36 @@ def test_predict_should_not_warn_about_feature_names() -> None: with warnings.catch_warnings(): warnings.filterwarnings("error", message="X has feature names") fitted_model.predict(test_set) + + +class MLModelRaiseValueErrorOnFitAndPredict: + x, y = None, None + + def fit(self, x: Any, y: Any) -> None: + # The Linter does not want unnecessary parameters, so we just assign them to the class values + self.x = x + self.y = y + raise ValueError("Raise ValueError (LearningError) in fit for Test") + + def predict(self, x: Any) -> None: + # The Linter does not want unnecessary parameters, so we just assign it to the class value + 
self.x = x + raise ValueError("Raise ValueError (PredictionError) in predict for Test") + + +def test_should_raise_learning_error() -> None: + tagged_table = Table({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}).tag_columns("col3") + with pytest.raises( + LearningError, + match=r"Error occurred while learning: Raise ValueError \(LearningError\) in fit for Test", + ): + fit(MLModelRaiseValueErrorOnFitAndPredict(), tagged_table) + + +def test_should_raise_prediction_error() -> None: + table = Table({"col1": [1, 2], "col2": [3, 4]}) + with pytest.raises( + PredictionError, + match=r"Error occurred while predicting: Raise ValueError \(PredictionError\) in predict for Test", + ): + predict(MLModelRaiseValueErrorOnFitAndPredict(), table, ["col1", "col2"], "col3")