From 7ab846d6a197143020ed1296e56a1cc6b0a66bf6 Mon Sep 17 00:00:00 2001 From: ParthivNaresh Date: Tue, 10 May 2022 13:48:26 -0400 Subject: [PATCH] Updated NoVarianceDataCheck to return only Warnings (#3506) * updated NoVarianceDataCheck * updated release notes * fix tests * Trigger Build * edited docs --- docs/source/release_notes.rst | 1 + .../user_guide/data_check_actions.ipynb | 52 +------------------ docs/source/user_guide/data_checks.ipynb | 20 ++----- evalml/data_checks/no_variance_data_check.py | 19 ++++--- .../data_checks_tests/test_data_checks.py | 8 +-- .../test_no_variance_data_check.py | 11 ++-- 6 files changed, 24 insertions(+), 87 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 5b820f007d..16a566f211 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -4,6 +4,7 @@ * Enhancements * Fixes * Changes + * Changed ``NoVarianceDataCheck`` to only output warnings :pr:`3506` * Updated ``roc_curve()`` and ``conf_matrix()`` to work with IntegerNullable and BooleanNullable types. :pr:`3465` * Changed ``ComponentGraph._transform_features`` to raise a ``PipelineError`` instead of a ``ValueError``. This is not a breaking change because ``PipelineError`` is a subclass of ``ValueError``. :pr:`3497` * Documentation Changes diff --git a/docs/source/user_guide/data_check_actions.ipynb b/docs/source/user_guide/data_check_actions.ipynb index 248f90ebe1..b8220974dc 100644 --- a/docs/source/user_guide/data_check_actions.ipynb +++ b/docs/source/user_guide/data_check_actions.ipynb @@ -99,9 +99,6 @@ "metadata": {}, "outputs": [], "source": [ - "# let's copy the datetime at row 1 for future use\n", - "date = X_train.iloc[1]['datetime']\n", - "\n", "# make row 1 all nan values\n", "X_train.iloc[1] = [None] * X_train.shape[1]\n", "\n", @@ -271,15 +268,9 @@ "\n", "# We address the errors by looking at the resulting dictionary errors listed\n", "\n", - "# first, let's address the `TARGET_HAS_NULL` error\n", + "# let's address the `TARGET_HAS_NULL` error\n", "y_train_no_errors.fillna(False, inplace=True)\n", "\n", - "# here, we address the `NO_VARIANCE` error \n", - "X_train_no_errors.drop(\"no_variance\", axis=1, inplace=True)\n", - "\n", - "# lastly, we address the `DATETIME_HAS_NAN` error with the date we had saved earlier\n", - "X_train_no_errors.iloc[1, 2] = date\n", - "\n", "# let's reinitialize the Woodwork DataTable\n", "X_train_no_errors.ww.init()\n", "X_train_no_errors.head()" @@ -301,47 +292,6 @@ "results_no_errors = search_iterative(X_train_no_errors, y_train_no_errors, problem_type='binary')\n", "results_no_errors" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Comparing removing only errors versus removing both warnings and errors\n", - "Let's see the differences in model performance when we remove only errors versus remove both warnings and errors. To do this, we compare the performance of the best pipelines on the validation data. Remember that in the search where we only address errors, we still have the `mostly_nulls` column present in the data, so we leave that column in the validation data for its respective search. We drop the other `no_variance` column from both searches.\n", - "\n", - "Additionally, we do some logical type setting since we had added additional noise to just the training data. This allows the data to be of the same types in both training and validation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop the no_variance column\n", - "X_valid.drop(\"no_variance\", axis=1, inplace=True)\n", - "\n", - "# logical type management\n", - "X_valid.ww.init(logical_types={\"customer_present\": \"Categorical\"})\n", - "y_valid = ww.init_series(y_valid, logical_type=\"Categorical\")\n", - "\n", - "best_pipeline_no_errors = results_no_errors[0].best_pipeline\n", - "print(\"Only dropping errors:\", best_pipeline_no_errors.score(X_valid, y_valid, [\"Log Loss Binary\"]), \"\\n\")\n", - "\n", - "# drop the mostly_nulls column and reinitialize the DataTable\n", - "X_valid.drop(\"mostly_nulls\", axis=1, inplace=True)\n", - "X_valid.ww.init()\n", - "\n", - "best_pipeline_clean = results_cleaned[0].best_pipeline\n", - "print(\"Addressing all actions:\", best_pipeline_clean.score(X_valid, y_valid, [\"Log Loss Binary\"]), \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can compare the differences in model performance when we address all action items (warnings and errors) in comparison to when we only address errors. While it isn't guaranteed that addressing all actions will always have better performance, we do recommend doing so since we only raise these issues when we believe the features have problems that could negatively impact or not benefit the search." - ] } ], "metadata": { diff --git a/docs/source/user_guide/data_checks.ipynb b/docs/source/user_guide/data_checks.ipynb index f94d0a1a0e..c2a5d55222 100644 --- a/docs/source/user_guide/data_checks.ipynb +++ b/docs/source/user_guide/data_checks.ipynb @@ -100,14 +100,10 @@ "no_variance_data_check = NoVarianceDataCheck()\n", "messages = no_variance_data_check.validate(X, y)\n", "\n", - "errors = [message for message in messages if message['level'] == 'error']\n", "warnings = [message for message in messages if message['level'] == 'warning']\n", "\n", "for warning in warnings:\n", - " print(\"Warning:\", warning['message'])\n", - "\n", - "for error in errors:\n", - " print(\"Error:\", error['message'])" + " print(\"Warning:\", warning['message'])" ] }, { @@ -133,14 +129,10 @@ "no_variance_data_check = NoVarianceDataCheck(count_nan_as_value=True)\n", "messages = no_variance_data_check.validate(X, y)\n", "\n", - "errors = [message for message in messages if message['level'] == 'error']\n", "warnings = [message for message in messages if message['level'] == 'warning']\n", "\n", "for warning in warnings:\n", - " print(\"Warning:\", warning['message'])\n", - "\n", - "for error in errors:\n", - " print(\"Error:\", error['message'])" + " print(\"Warning:\", warning['message'])" ] }, { @@ -643,7 +635,7 @@ "metadata": {}, "outputs": [], "source": [ - "from evalml.data_checks import NoVarianceDataCheck, DataCheckError, DataCheckWarning\n", + "from evalml.data_checks import NoVarianceDataCheck, DataCheckWarning\n", "\n", "X = pd.DataFrame({\"no var col\": [0, 0, 0],\n", " \"no var col with nan\": [1, np.nan, 1],\n", @@ -653,14 +645,10 @@ "no_variance_data_check = NoVarianceDataCheck(count_nan_as_value=True)\n", "messages = no_variance_data_check.validate(X, y)\n", "\n", - "errors = [message for message in messages if message['level'] == 'error']\n", "warnings = [message for message in messages if message['level'] == 'warning']\n", "\n", "for warning in warnings:\n", - " print(\"Warning:\", warning['message'])\n", - "\n", - "for error in errors:\n", - " print(\"Error:\", error['message'])" + " print(\"Warning:\", warning['message'])" ] }, { diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 59e68886e8..2939408371 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -3,7 +3,6 @@ DataCheck, DataCheckActionCode, DataCheckActionOption, - DataCheckError, DataCheckMessageCode, DataCheckWarning, ) @@ -46,7 +45,7 @@ def validate(self, X, y=None): ... { ... "message": "'First_Column' has 1 unique value.", ... "data_check_name": "NoVarianceDataCheck", - ... "level": "error", + ... "level": "warning", ... "details": {"columns": ["First_Column"], "rows": None}, ... "code": "NO_VARIANCE", ... "action_options": [ @@ -61,7 +60,7 @@ def validate(self, X, y=None): ... { ... "message": "Y has 1 unique value.", ... "data_check_name": "NoVarianceDataCheck", - ... "level": "error", + ... "level": "warning", ... "details": {"columns": ["Y"], "rows": None}, ... "code": "NO_VARIANCE", ... "action_options": [] @@ -81,7 +80,7 @@ def validate(self, X, y=None): ... { ... "message": "Y has 0 unique values.", ... "data_check_name": "NoVarianceDataCheck", - ... "level": "error", + ... "level": "warning", ... "details": {"columns": ["Y"], "rows": None}, ... "code": "NO_VARIANCE_ZERO_UNIQUE", ... "action_options":[] @@ -96,7 +95,7 @@ def validate(self, X, y=None): ... { ... "message": "'First_Column' has 1 unique value.", ... "data_check_name": "NoVarianceDataCheck", - ... "level": "error", + ... "level": "warning", ... "details": {"columns": ["First_Column"], "rows": None}, ... "code": "NO_VARIANCE", ... "action_options": [ @@ -111,7 +110,7 @@ def validate(self, X, y=None): ... { ... "message": "Y has 1 unique value.", ... "data_check_name": "NoVarianceDataCheck", - ... "level": "error", + ... "level": "warning", ... "details": {"columns": ["Y"], "rows": None}, ... "code": "NO_VARIANCE", ... "action_options": [] @@ -176,7 +175,7 @@ def validate(self, X, y=None): two_unique_with_null_message = "{} has two unique values including nulls. Consider encoding the nulls for this column to be useful for machine learning." if zero_unique: messages.append( - DataCheckError( + DataCheckWarning( message=zero_unique_message.format( (", ").join(["'{}'".format(str(col)) for col in zero_unique]), ), @@ -194,7 +193,7 @@ def validate(self, X, y=None): ) if one_unique: messages.append( - DataCheckError( + DataCheckWarning( message=one_unique_message.format( (", ").join(["'{}'".format(str(col)) for col in one_unique]), ), @@ -244,7 +243,7 @@ def validate(self, X, y=None): if y_unique_count == 0: messages.append( - DataCheckError( + DataCheckWarning( message=zero_unique_message.format(y_name), data_check_name=self.name, message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE, @@ -254,7 +253,7 @@ def validate(self, X, y=None): elif y_unique_count == 1: messages.append( - DataCheckError( + DataCheckWarning( message=one_unique_message.format(y_name), data_check_name=self.name, message_code=DataCheckMessageCode.NO_VARIANCE, diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index c953722b73..f4a5c8ddd7 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -237,7 +237,7 @@ def get_expected_messages(problem_type): ) ], ).to_dict(), - DataCheckError( + DataCheckWarning( message="'all_null', 'also_all_null' has 0 unique values.", data_check_name="NoVarianceDataCheck", message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE, @@ -250,7 +250,7 @@ def get_expected_messages(problem_type): ) ], ).to_dict(), - DataCheckError( + DataCheckWarning( message="'lots_of_null' has 1 unique value.", data_check_name="NoVarianceDataCheck", message_code=DataCheckMessageCode.NO_VARIANCE, @@ -398,7 +398,7 @@ def test_default_data_checks_regression(input_type, data_checks_input_dataframe) == expected[:3] + expected[4:6] + [ - DataCheckError( + DataCheckWarning( message="Y has 1 unique value.", data_check_name="NoVarianceDataCheck", message_code=DataCheckMessageCode.NO_VARIANCE, @@ -490,7 +490,7 @@ def __eq__(self, series_2): ) ], ).to_dict(), - DataCheckError( + DataCheckWarning( message="'all_null', 'also_all_null' has 0 unique values.", data_check_name="NoVarianceDataCheck", message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE, diff --git a/evalml/tests/data_checks_tests/test_no_variance_data_check.py b/evalml/tests/data_checks_tests/test_no_variance_data_check.py index f74807a97d..cba203f7c4 100644 --- a/evalml/tests/data_checks_tests/test_no_variance_data_check.py +++ b/evalml/tests/data_checks_tests/test_no_variance_data_check.py @@ -6,7 +6,6 @@ from evalml.data_checks import ( DataCheckActionCode, DataCheckActionOption, - DataCheckError, DataCheckMessageCode, DataCheckWarning, NoVarianceDataCheck, @@ -42,27 +41,27 @@ data_check_name=no_variance_data_check_name, metadata={"columns": ["feature"]}, ) -feature_0_unique = DataCheckError( +feature_0_unique = DataCheckWarning( message="'feature' has 0 unique values.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE, details={"columns": ["feature"]}, action_options=[drop_feature_action_option], ).to_dict() -feature_1_unique = DataCheckError( +feature_1_unique = DataCheckWarning( message="'feature' has 1 unique value.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE, details={"columns": ["feature"]}, action_options=[drop_feature_action_option], ).to_dict() -labels_0_unique = DataCheckError( +labels_0_unique = DataCheckWarning( message="Y has 0 unique values.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE, details={"columns": ["Y"]}, ).to_dict() -labels_1_unique = DataCheckError( +labels_1_unique = DataCheckWarning( message="Y has 1 unique value.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE, @@ -149,7 +148,7 @@ all_null_y_with_name, False, [ - DataCheckError( + DataCheckWarning( message="Labels has 0 unique values.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE_ZERO_UNIQUE,