From 03e129d633415595caabed7b5b688fda67166756 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Tue, 24 Aug 2021 17:51:17 +0100
Subject: [PATCH 01/36] release notes

---
 docs/source/release_notes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 26e820eb63..d56dede835 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
         * Removed SVM "linear" and "precomputed" kernel hyperparameter options, and improved default parameters :pr:`2651`
+        * Updated to support Woodwork 0.6.0 :pr:``
     * Fixes
     * Changes
         * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`

From 0dac5a043778135d0ba313c7269774742637801a Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Tue, 24 Aug 2021 17:57:33 +0100
Subject: [PATCH 02/36] Upgrade woodwork versions

---
 core-requirements.txt                                           | 2 +-
 docs/source/release_notes.rst                                   | 2 +-
 .../dependency_update_check/latest_dependency_versions.txt      | 2 +-
 .../tests/dependency_update_check/minimum_core_requirements.txt | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/core-requirements.txt b/core-requirements.txt
index e0942fe43d..7155fc8973 100644
--- a/core-requirements.txt
+++ b/core-requirements.txt
@@ -12,7 +12,7 @@ psutil>=5.6.6
 requirements-parser>=0.2.0
 shap>=0.36.0
 texttable>=1.6.2
-woodwork==0.5.1
+woodwork==0.6.0
 dask>=2.12.0
 featuretools>=0.26.1
 nlp-primitives>=1.1.0

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index d56dede835..58773bee92 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,7 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
         * Removed SVM "linear" and "precomputed" kernel hyperparameter options, and improved default parameters :pr:`2651`
-        * Updated to support Woodwork 0.6.0 :pr:``
+        * Updated to support Woodwork 0.6.0 :pr:`2690`
     * Fixes
     * Changes
         * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`

diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
index ee85b1e26e..9fe6b759ae 100644
--- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt
+++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
@@ -29,5 +29,5 @@ shap==0.39.0
 sktime==0.7.0
 statsmodels==0.12.2
 texttable==1.6.4
-woodwork==0.5.1
+woodwork==0.6.0
 xgboost==1.4.2

diff --git a/evalml/tests/dependency_update_check/minimum_core_requirements.txt b/evalml/tests/dependency_update_check/minimum_core_requirements.txt
index 528d357668..3fecab4430 100644
--- a/evalml/tests/dependency_update_check/minimum_core_requirements.txt
+++ b/evalml/tests/dependency_update_check/minimum_core_requirements.txt
@@ -12,7 +12,7 @@ psutil==5.6.6
 requirements-parser==0.2.0
 shap==0.36.0
 texttable==1.6.2
-woodwork==0.5.1
+woodwork==0.6.0
 dask==2.12.0
 featuretools==0.26.1
 nlp-primitives==1.1.0
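Note: this version bump drives every test change that follows. Woodwork 0.6.0 is stricter about string columns than 0.5.1: short object columns that used to infer as Categorical can now come through as Unknown, so the later patches either repeat their test data or pin logical types explicitly. A minimal sketch of the explicit-override pattern, assuming Woodwork 0.6.0 semantics (the column name is illustrative):

    import pandas as pd
    import woodwork as ww  # registers the .ww accessor

    X = pd.DataFrame({"currency": ["USD", "EUR", "USD", "GBP"]})
    # Pin the logical type instead of relying on inference.
    X.ww.init(logical_types={"currency": "categorical"})
    print(X.ww.logical_types)  # currency -> Categorical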
From cdcea2c2e30af024b9d46e95806460c6f2c95ec4 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Wed, 25 Aug 2021 13:17:04 +0100
Subject: [PATCH 03/36] extend length of columns to identify as categorical

---
 .../test_regression.py                    |  6 ++---
 .../pipeline_tests/test_pipeline_utils.py | 22 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
index d11f1ad595..cd83f08b7d 100644
--- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
+++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -85,11 +86,10 @@ def test_woodwork_regression_pipeline(diabetes_local, linear_regression_pipeline
 
 def test_custom_indices():
     X = pd.DataFrame(
-        {"a": ["a", "b", "a", "a", "a", "c", "c", "c"], "b": [0, 1, 1, 1, 1, 1, 0, 1]}
+        {"a": ["a", "b", "a", "a", "a", "c", "c", "c"]*3, "b": [0, 1, 1, 1, 1, 1, 0, 1]*3}
     )
-    y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0], index=[7, 2, 1, 4, 5, 3, 6, 8])
+    y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0]*3, index=np.random.choice(24, 24, replace=False))
     x1, x2, y1, y2 = split_data(X, y, problem_type="regression")
-
     pipeline = RegressionPipeline(
         component_graph=["Imputer", "One Hot Encoder", "Linear Regressor"],
         parameters={},

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index 84149a115e..3198dc9c2e 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -47,10 +47,10 @@ def _get_test_data_from_configuration(
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
-            "numerical": range(7),
-            "categorical": ["a", "b", "a", "c", "c", "a", "b"],
-            "dates": pd.date_range("2000-02-03", periods=7, freq="W"),
+            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]*2,
+            "numerical": range(14),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b"]*2,
+            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
                 "why woodwork",
                 "text for a column, this should be a text column!!",
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
-            ],
+            ]*2,
             "email": [
                 "abalone_0@gmail.com",
                 "AbaloneRings@yahoo.com",
                 "abalone_2@abalone.com",
                 "$titanic_data$@hotmail.com",
                 "fooEMAIL@email.org",
                 "evalml@evalml.org",
                 "evalml@alteryx.org",
-            ],
+            ]*2,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
                 "https://twitter.com/AlteryxOSS",
                 "https://www.twitter.com/AlteryxOSS",
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
-            ],
+            ]*2,
         }
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1])
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1]*2)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1])
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1]*2)
     elif is_regression(problem_type):
         if lognormal_distribution:
-            y = pd.Series([1, 1, 1, 2, 3, 6, 9])
+            y = pd.Series([1, 1, 1, 2, 3, 6, 9]*2)
         else:
-            y = pd.Series([1, 2, 3, 3, 3, 4, 5])
+            y = pd.Series([1, 2, 3, 3, 3, 4, 5]*2)
 
     X = X_all[column_names]
     if input_type == "ww":
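Note: the `*2`/`*3` repetition above works because Woodwork's categorical inference is ratio-based: a string column only registers as Categorical when its unique values are few relative to its length (an assumption about 0.6.0's default thresholds; the exact cutoff may differ). A sketch of that behavior:

    import pandas as pd
    import woodwork as ww

    short = ww.init_series(pd.Series(["a", "b", "c"]))
    long = ww.init_series(pd.Series(["a", "b", "c"] * 5))
    # Few repeats: likely inferred as Unknown under 0.6.0;
    # many repeats: inferred as Categorical.
    print(short.ww.logical_type, long.ww.logical_type)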
From ab4d80e24cb2253f4697e944b705f0cf7cf081e5 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Wed, 25 Aug 2021 16:41:03 +0100
Subject: [PATCH 04/36] data checks updated

---
 evalml/tests/data_checks_tests/test_data_checks.py       |  2 +-
 .../data_checks_tests/test_id_columns_data_check.py      |  1 +
 .../test_invalid_targets_data_check.py                   |  1 +
 .../test_multicollinearity_data_check.py                 |  3 ++-
 .../test_target_leakage_data_check.py                    | 10 +---------
 evalml/utils/woodwork_utils.py                           |  2 +-
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py
index 1c495ee5cb..529fc66d68 100644
--- a/evalml/tests/data_checks_tests/test_data_checks.py
+++ b/evalml/tests/data_checks_tests/test_data_checks.py
@@ -340,7 +340,7 @@ def test_default_data_checks_regression(input_type):
     X["nan_dt_col"][0] = None
     y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
     y_no_variance = pd.Series([5] * 5)
-    X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
+    X.ww.init(logical_types={"lots_of_null": "categorical", "natural_language_nan": "NaturalLanguage"})
     if input_type == "ww":
         y = ww.init_series(y)
         y_no_variance = ww.init_series(y_no_variance)

diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py
index eb550fb85f..f15a3c6f45 100644
--- a/evalml/tests/data_checks_tests/test_id_columns_data_check.py
+++ b/evalml/tests/data_checks_tests/test_id_columns_data_check.py
@@ -135,6 +135,7 @@ def test_id_columns_strings():
         "col_6": [0.1, 0.2, 0.3, 0.4],
     }
     X = pd.DataFrame.from_dict(X_dict)
+    X.ww.init(logical_types={"col_1_id": "categorical", "col_2": "categorical", "Id": "categorical", "col_5": "categorical"})
     id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
     assert id_cols_check.validate(X) == {
         "warnings": [

diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
index ce5ebfb20c..b5d8453952 100644
--- a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
+++ b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
@@ -231,6 +231,7 @@ def test_invalid_target_data_input_formats():
     # test Woodwork
     y = pd.Series([None, None, None, 0])
     X = pd.DataFrame({"col": range(len(y))})
+
     messages = invalid_targets_check.validate(X, y)
     assert messages == expected

diff --git a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
index b086ae74a2..23961985e4 100644
--- a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
+++ b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
@@ -86,7 +86,8 @@ def test_multicollinearity_nonnumeric_cols(data_type, make_data_type):
             "col_6": [1, 1, 2, 3, 1],
         }
     )
-    X = make_data_type(data_type, X)
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical", "col_4": "categorical", "col_5": "categorical"})
+
     multi_check = MulticollinearityDataCheck(threshold=0.9)
     assert multi_check.validate(X) == {
         "warnings": [

diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
index 691e171dde..828a7e186d 100644
--- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
+++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
@@ -247,6 +247,7 @@ def test_target_leakage_types():
     X["d"] = ~y
     X["e"] = [0, 0, 0, 0]
     y = y.astype(bool)
+    X.ww.init(logical_types={"a": "categorical"})
 
     expected = {
         "warnings": [
@@ -398,12 +399,6 @@ def test_target_leakage_regression():
             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
             details={"column": "c"},
         ).to_dict(),
-        DataCheckWarning(
-            message="Column 'e' is 80.0% or more correlated with the target",
-            data_check_name=target_leakage_data_check_name,
-            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
-            details={"column": "e"},
-        ).to_dict(),
     ],
     "errors": [],
     "actions": [
@@ -416,9 +411,6 @@ def test_target_leakage_regression():
         DataCheckAction(
             DataCheckActionCode.DROP_COL, metadata={"column": "c"}
         ).to_dict(),
-        DataCheckAction(
-            DataCheckActionCode.DROP_COL, metadata={"column": "e"}
-        ).to_dict(),
     ],
 }

diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 1adb2269cc..2b93493487 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -100,8 +100,8 @@ def is_column_unknown(data, col):
         return convert_all_nan_unknown_to_double(data)
 
     if isinstance(data, pd.Series):
+        data = data.replace(pd.NA, np.nan)
         if all(data.isna()):
-            data = data.replace(pd.NA, np.nan)
             feature_types = "Double"
         return ww.init_series(data, logical_type=feature_types)
     else:
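Note: the woodwork_utils.py hunk above moves the pd.NA -> np.nan swap ahead of the all-NaN check so it runs for every series, not only all-null ones (patch 07 below reverts it). In isolation, the guarded conversion looks roughly like this (a sketch, not the exact helper):

    import numpy as np
    import pandas as pd
    import woodwork as ww

    data = pd.Series([pd.NA, pd.NA], dtype="object")
    # Swap pandas' NA sentinel for np.nan before initializing as Double,
    # mirroring the reordered line in woodwork_utils.py.
    data = data.replace(pd.NA, np.nan)
    print(ww.init_series(data, logical_type="Double"))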
"c", np.nan], + "int with nan": [np.nan, 1, 0, 0, 1]*4, + "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0]*4, + "object with nan": ["b", "b", np.nan, "c", np.nan]*4, "bool col with nan": pd.Series( - [True, np.nan, False, np.nan, True], dtype="category" + [True, np.nan, False, np.nan, True]*4, dtype="category" ), - "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan], + "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]*4, "all nan cat": pd.Series( - [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="category" + [np.nan, np.nan, np.nan, np.nan, np.nan]*4, dtype="category" ), } ) @@ -91,16 +91,16 @@ def test_numeric_only_input(imputer_test_data): X = imputer_test_data[ ["int col", "float col", "int with nan", "float with nan", "all nan"] ] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer(numeric_impute_strategy="median") imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int col": [0, 1, 2, 0, 3], - "float col": [0.0, 1.0, 0.0, -2.0, 5.0], - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.0], + "int col": [0, 1, 2, 0, 3]*4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -122,29 +122,29 @@ def test_categorical_only_input(imputer_test_data): "all nan cat", ] ] - y = pd.Series([0, 0, 1, 0, 1]) - imputer = Imputer() - imputer.fit(X, y) - transformed = imputer.transform(X, y) + y = pd.Series([0, 0, 1, 0, 1]*4) + expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "bool col": [True, False, False, True, True], + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "bool col": [True, False, False, True, True]*4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"], dtype="category" + ["0", "1", "0", "0", "3"]*4, dtype="category" ), - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True], dtype="category" + [True, True, False, True, True]*4, dtype="category" ), } ) imputer = Imputer() + imputer.fit(X, y) transformed = imputer.fit_transform(X, y) + assert_frame_equal(transformed, expected, check_dtype=False) @@ -157,20 +157,20 @@ def test_categorical_and_numeric_input(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "int col": [0, 1, 2, 0, 3], - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "float col": [0.0, 1.0, 0.0, -2.0, 5.0], - "bool col": [True, False, False, True, True], + "int col": [0, 1, 2, 0, 3]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, + "bool col": [True, False, False, True, True]*4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"], dtype="category" + ["0", "1", "0", "0", "3"]*4, dtype="category" ), - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.0], - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + 
"int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, + "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True], dtype="category" + [True, True, False, True, True]*4, dtype="category" ), } ) @@ -183,7 +183,7 @@ def test_categorical_and_numeric_input(imputer_test_data): def test_drop_all_columns(imputer_test_data): X = imputer_test_data[["all nan cat", "all nan"]] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) X.ww.init() imputer = Imputer() imputer.fit(X, y) @@ -288,7 +288,7 @@ def test_imputer_fill_value(imputer_test_data): "bool col with nan", ] ] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -299,16 +299,16 @@ def test_imputer_fill_value(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with nan": [-1, 1, 0, 0, 1], + "int with nan": [-1, 1, 0, 0, 1]*4, "categorical with nan": pd.Series( - ["fill", "1", "fill", "0", "3"], dtype="category" + ["fill", "1", "0", "0", "3"]*4, dtype="category" ), - "float with nan": [0.0, 1.0, -1, -1.0, 0.0], + "float with nan": [0.0, 1.0, -1, -1.0, 0.0]*4, "object with nan": pd.Series( - ["b", "b", "fill", "c", "fill"], dtype="category" + ["b", "b", "fill", "c", "fill"]*4, dtype="category" ), "bool col with nan": pd.Series( - [True, "fill", False, "fill", True], dtype="category" + [True, "fill", False, "fill", True]*4, dtype="category" ), } ) @@ -326,7 +326,7 @@ def test_imputer_fill_value(imputer_test_data): def test_imputer_no_nans(imputer_test_data): X = imputer_test_data[["categorical col", "object col", "bool col"]] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -338,10 +338,10 @@ def test_imputer_no_nans(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "bool col": [True, False, False, True, True], + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "bool col": [True, False, False, True, True]*4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -359,25 +359,25 @@ def test_imputer_no_nans(imputer_test_data): def test_imputer_with_none(): X = pd.DataFrame( { - "int with None": [1, 0, 5, None], - "float with None": [0.1, 0.0, 0.5, None], - "category with None": pd.Series(["b", "a", "a", None], dtype="category"), - "boolean with None": pd.Series([True, None, False, True]), - "object with None": ["b", "a", "a", None], - "all None": [None, None, None, None], + "int with None": [1, 0, 5, None]*4, + "float with None": [0.1, 0.0, 0.5, None]*4, + "category with None": pd.Series(["b", "a", "a", None]*4, dtype="category"), + "boolean with None": pd.Series([True, None, False, True]*4), + "object with None": ["b", "a", "a", None]*4, + "all None": [None, None, None, None]*4, } ) - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with None": [1, 0, 5, 2], - "float with None": [0.1, 0.0, 0.5, 0.2], - "category with None": pd.Series(["b", "a", "a", "a"], 
dtype="category"), - "boolean with None": pd.Series([True, True, False, True], dtype="category"), - "object with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + "int with None": [1, 0, 5, 2]*4, + "float with None": [0.1, 0.0, 0.5, 0.2]*4, + "category with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + "boolean with None": pd.Series([True, True, False, True]*4, dtype="category"), + "object with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), } ) assert_frame_equal(expected, transformed, check_dtype=False) @@ -403,9 +403,9 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): @pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_bool_dtype_object(data_type, make_data_type): - X = pd.DataFrame([True, np.nan, False, np.nan, True]) - y = pd.Series([1, 0, 0, 1, 0]) - X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category") + X = pd.DataFrame([True, np.nan, False, np.nan, True]*4) + y = pd.Series([1, 0, 0, 1, 0]*4) + X_expected_arr = pd.DataFrame([True, True, False, True, True]*4, dtype="category") X = make_data_type(data_type, X) y = make_data_type(data_type, y) imputer = Imputer() @@ -418,17 +418,17 @@ def test_imputer_bool_dtype_object(data_type, make_data_type): def test_imputer_multitype_with_one_bool(data_type, make_data_type): X_multi = pd.DataFrame( { - "bool with nan": pd.Series([True, np.nan, False, np.nan, False]), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + "bool with nan": pd.Series([True, np.nan, False, np.nan, False]*4), + "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) - y = pd.Series([1, 0, 0, 1, 0]) + y = pd.Series([1, 0, 0, 1, 0]*4) X_multi_expected_arr = pd.DataFrame( { "bool with nan": pd.Series( - [True, False, False, False, False], dtype="category" + [True, False, False, False, False]*4, dtype="category" ), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) @@ -468,23 +468,23 @@ def test_imputer_int_preserved(): def test_imputer_bool_preserved(): - X = pd.DataFrame(pd.Series([True, False, True, np.nan])) + X = pd.DataFrame(pd.Series([True, False, True, np.nan]*4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, True], dtype="category")), + pd.DataFrame(pd.Series([True, False, True, True]*4, dtype="category")), ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { 0: Categorical } - X = pd.DataFrame(pd.Series([True, False, True, False])) + X = pd.DataFrame(pd.Series([True, False, True, False]*4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, False])), + pd.DataFrame(pd.Series([True, False, True, False]*4)), check_dtype=False, ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Boolean} From 7f1f92a80ce614a29f5b14f42a4a2985921e05e8 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 25 Aug 2021 19:14:31 +0100 Subject: [PATCH 06/36] lgbm updates --- evalml/tests/component_tests/test_lgbm_classifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py 
From 7f1f92a80ce614a29f5b14f42a4a2985921e05e8 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 11:30:57 +0100
Subject: [PATCH 06/36] lgbm updates

---
 evalml/tests/component_tests/test_lgbm_classifier.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index 1766024f0a..bd6dd0ba41 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -173,11 +173,11 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary):
 @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict")
 def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X = pd.DataFrame(
-        {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}
+        {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2}
     )
-    y = pd.Series([1, 1, 0, 0, 0, 1])
+    y = pd.Series([1, 1, 0, 0, 0, 1]*2)
     X_expected = pd.DataFrame(
-        {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}
+        {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2}
     )
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")

From eba1d4b9be7cbf65d171859c06cae1036eed891f Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 11:30:57 +0100
Subject: [PATCH 07/36] no message

---
 evalml/utils/woodwork_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 2b93493487..1adb2269cc 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -100,8 +100,8 @@ def is_column_unknown(data, col):
         return convert_all_nan_unknown_to_double(data)
 
     if isinstance(data, pd.Series):
-        data = data.replace(pd.NA, np.nan)
         if all(data.isna()):
+            data = data.replace(pd.NA, np.nan)
             feature_types = "Double"
         return ww.init_series(data, logical_type=feature_types)
     else:
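Note: the X_expected frames in the LightGBM tests assert that each categorical feature is encoded as ordinal codes (0.0, 1.0, 2.0) carried in a category dtype. The mapping itself is plain pandas:

    import pandas as pd

    feature = pd.Series(["a", "a", "b", "b", "c", "c"] * 2)
    # Letters map to the ordinal codes asserted in X_expected.
    codes = feature.astype("category").cat.codes.astype("float64")
    print(codes.unique())  # [0. 1. 2.]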
From 22c79c16627a5576ea353a9a6a809c7a98cf265d Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 15:00:05 +0100
Subject: [PATCH 08/36] model understanding updates

---
 .../test_explainers.py                            |  7 ++++++-
 .../test_force_plots.py                           |  2 ++
 .../test_partial_dependence.py                    | 10 +++++-----
 .../test_permutation_importance.py                |  1 +
 evalml/tests/pipeline_tests/test_component_graph.py | 15 ++++++++++-----
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
index ccd7b3ab3e..2405e2cabd 100644
--- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
+++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
@@ -1263,6 +1263,7 @@ def transform_y_for_problem_type(problem_type, y):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -1304,7 +1305,7 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
 
-    X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
+    X.ww.set_types(logical_types={"provider": "NaturalLanguage", "currency": "categorical", "expiration_date": "categorical"})
     component_graph = [
         "Select Columns Transformer",
         "One Hot Encoder",
@@ -1361,6 +1362,7 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -1415,6 +1417,7 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = {
         "SelectNumeric": ["Select Columns Transformer", "X", "y"],
@@ -1474,6 +1477,7 @@ def test_categories_aggregated_but_not_those_that_are_dropped(
     pipeline_class, estimator, fraud_100
 ):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = [
         "Select Columns Transformer",
@@ -1517,6 +1521,7 @@ def test_categories_aggregated_when_some_are_dropped(
     pipeline_class, estimator, fraud_100
 ):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = [
         "Select Columns Transformer",

diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
index 343a1ce97a..b08f8ccc26 100644
--- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
+++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
@@ -217,6 +217,7 @@ def test_force_plot_regression(
 def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
     columns_to_select = ["datetime", "amount", "provider", "currency"]
+    X.ww.init(logical_types={"currency": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -250,6 +251,7 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
 def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
     columns_to_select = ["datetime", "amount", "provider", "currency"]
+    X.ww.init(logical_types={"currency": "categorical"})
 
     X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
     component_graph = [

diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index d057b496e1..6751b29a30 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -135,15 +135,15 @@ def test_partial_dependence_with_non_numeric_columns(
 ):
     X = pd.DataFrame(
         {
-            "numeric": [1, 2, 3, 0],
-            "also numeric": [2, 3, 4, 1],
-            "string": ["a", "b", "a", "c"],
-            "also string": ["c", "b", "a", "d"],
+            "numeric": [1, 2, 3, 0]*4,
+            "also numeric": [2, 3, 4, 1]*4,
+            "string": ["a", "b", "a", "c"]*4,
+            "also string": ["c", "b", "a", "c"]*4,
         }
     )
     if data_type == "ww":
         X.ww.init()
-    y = [0, 0.2, 1.4, 1]
+    y = [0, 0.2, 1.4, 1]*4
     pipeline = linear_regression_pipeline_class(
         parameters={"Linear Regressor": {"n_jobs": 1}}
     )

diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py
index 1833cf1b7a..dfbe0f1c35 100644
--- a/evalml/tests/model_understanding_tests/test_permutation_importance.py
+++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py
@@ -310,6 +310,7 @@ def test_fast_permutation_importance_matches_slow_output(
             "dependency not installed."
         )
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
     if pipeline_class == LinearPipelineWithTextFeatures:
         X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
 

diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py
index 5f99549e2a..4246d0a028 100644
--- a/evalml/tests/pipeline_tests/test_component_graph.py
+++ b/evalml/tests/pipeline_tests/test_component_graph.py
@@ -753,6 +753,8 @@ def test_computation_input_custom_index(index, example_graph):
         index=index,
     )
     y = pd.Series([1, 2, 1, 2, 1])
+    X.ww.init(logical_types={"categories": "categorical"})
+
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate({})
     component_graph.fit(X, y)
@@ -881,6 +883,7 @@ def test_input_feature_names(example_graph):
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
+    X.ww.init(logical_types={"column_1": "categorical"})
 
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate(
@@ -945,7 +948,7 @@ def test_custom_input_feature_types(example_graph):
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
-    X = infer_feature_types(X, {"column_2": "categorical"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_2": "categorical"})
 
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate(
@@ -1015,7 +1018,7 @@ def test_component_graph_dataset_with_different_types():
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
 
     X = infer_feature_types(
-        X, {"column_2": "categorical", "column_5": "NaturalLanguage"}
+        X, {"column_1": "categorical", "column_2": "categorical", "column_5": "NaturalLanguage"}
     )
     component_graph = ComponentGraph(graph)
@@ -1181,7 +1184,7 @@ def test_component_graph_types_merge_mock(mock_rf_fit):
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
     # woodwork would infer this as boolean by default -- convert to a numeric type
-    X = infer_feature_types(X, {"column_3": "integer"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_3": "integer"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1263,7 +1266,7 @@ def transform(self, X, y=None):
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
 
     # woodwork would infer this as boolean by default -- convert to a numeric type
-    X.ww.init(semantic_tags={"address": "address"})
+    X.ww.init(logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1333,7 +1336,7 @@ def test_component_graph_types_merge():
     X["column_5"] = X["column_4"]
     X["column_6"] = [42.0] * len(X)
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
-    X = infer_feature_types(X, {"column_5": "NaturalLanguage"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_5": "NaturalLanguage"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1416,6 +1419,7 @@ def test_component_graph_dataset_with_target_imputer():
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, np.nan])
+    X = infer_feature_types(X, {"column_1": "categorical"})
     graph = {
         "Target Imputer": [TargetImputer, "X", "y"],
         "OneHot": [OneHotEncoder, "Target Imputer.x", "Target Imputer.y"],
@@ -1919,6 +1923,7 @@ def test_final_component_features_does_not_have_target():
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
+    X.ww.init(logical_types={"column_1": "categorical"})
 
     cg = ComponentGraph(
         {

From 7db57872b1153f8e8d20cd59d098332d176820b9 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 20:43:20 +0100
Subject: [PATCH 09/36] imputer fixes

---
 .../component_tests/test_target_imputer.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py
index 5a77c41db2..8a9ad5ce26 100644
--- a/evalml/tests/component_tests/test_target_imputer.py
+++ b/evalml/tests/component_tests/test_target_imputer.py
@@ -57,11 +57,11 @@ def test_target_imputer_mean():
         (None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])),
         (
             None,
-            pd.Series([np.nan, "a", "b"]),
-            pd.Series(["missing_value", "a", "b"]).astype("category"),
+            pd.Series([np.nan, "a", "b"]*5),
+            pd.Series(["missing_value", "a", "b"]*5).astype("category"),
         ),
         (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])),
-        (3, pd.Series([np.nan, "a", "b"]), pd.Series([3, "a", "b"]).astype("category")),
+        (3, pd.Series([np.nan, "a", "b"]*5), pd.Series([3, "a", "b"]*5).astype("category")),
     ],
 )
 def test_target_imputer_constant(fill_value, y, y_expected):
@@ -71,9 +71,9 @@ def test_target_imputer_constant(fill_value, y, y_expected):
 
 
 def test_target_imputer_most_frequent():
-    y = pd.Series([np.nan, "a", "b"])
+    y = pd.Series([np.nan, "a", "b"]*5)
     imputer = TargetImputer(impute_strategy="most_frequent")
-    y_expected = pd.Series(["a", "a", "b"]).astype("category")
+    y_expected = pd.Series(["a", "a", "b"]*5).astype("category")
     _, y_t = imputer.fit_transform(None, y)
     assert_series_equal(y_expected, y_t, check_dtype=False)
 
@@ -85,7 +85,7 @@ def test_target_imputer_most_frequent():
 
 
 def test_target_imputer_col_with_non_numeric_with_numeric_strategy():
-    y = pd.Series([np.nan, "a", "b"])
+    y = pd.Series([np.nan, "a", "b"]*5)
     imputer = TargetImputer(impute_strategy="mean")
     with pytest.raises(
         ValueError, match="Cannot use mean strategy with non-numeric data"
@@ -190,16 +190,16 @@ def test_target_imputer_with_none(y, y_expected):
     "y, y_expected",
     [
         (
-            pd.Series(["b", "a", "a", None], dtype="category"),
-            pd.Series(["b", "a", "a", "a"], dtype="category"),
+            pd.Series(["b", "a", "a", None]*4, dtype="category"),
+            pd.Series(["b", "a", "a", "a"]*4, dtype="category"),
         ),
         (
             pd.Series([True, None, False, True], dtype="category"),
             pd.Series([True, True, False, True], dtype="category"),
         ),
        (
-            pd.Series(["b", "a", "a", None]),
-            pd.Series(["b", "a", "a", "a"], dtype="category"),
+            pd.Series(["b", "a", "a", None]*4),
+            pd.Series(["b", "a", "a", "a"]*4, dtype="category"),
         ),
     ],
 )
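Note: as in the Imputer tests, the *4/*5 repetition keeps short object targets categorical under the new inference. The most-frequent strategy being exercised reduces to roughly this (a sketch):

    import pandas as pd

    y = pd.Series([None, "a", "b"] * 5)
    # "a" and "b" tie at five occurrences each; mode() returns them
    # sorted, so "a" wins -- matching the expected series in
    # test_target_imputer_most_frequent.
    print(y.fillna(y.mode()[0]).astype("category").head(3).tolist())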
"col_3": ["a", "a", "a", "a", "a"], } ) + X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(handle_missing="value") encoder.fit(X, y) @@ -116,13 +117,13 @@ def test_null_values_in_dataframe(): def test_cols(): X = pd.DataFrame( { - "col_1": [1, 2, 1, 1, 2], - "col_2": ["2", "1", "1", "1", "1"], - "col_3": ["a", "a", "a", "a", "a"], + "col_1": [1, 2, 1, 1, 2]*2, + "col_2": ["2", "1", "1", "1", "1"]*2, + "col_3": ["a", "a", "a", "a", "a"]*2, } ) X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"}) - y = pd.Series([0, 1, 1, 1, 0]) + y = pd.Series([0, 1, 1, 1, 0]*2) encoder = TargetEncoder(cols=[]) encoder.fit(X, y) X_t = encoder.transform(X) @@ -133,9 +134,9 @@ def test_cols(): X_t = encoder.transform(X) X_expected = pd.DataFrame( { - "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), - "col_2": [0.60000, 0.742886, 0.742886, 0.742886, 0.742886], - "col_3": pd.Series(["a", "a", "a", "a", "a"], dtype="category"), + "col_1": pd.Series([1, 2, 1, 1, 2]*2, dtype="int64"), + "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863]*2, + "col_3": pd.Series(["a", "a", "a", "a", "a"]*2, dtype="category"), } ) assert_frame_equal(X_expected, X_t, check_less_precise=True) @@ -157,6 +158,7 @@ def test_transform(): "col_3": ["a", "a", "a", "b", "a"], } ) + X.ww.init(logical_types={"col_2": "categorical", "col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder() encoder.fit(X, y) @@ -180,6 +182,7 @@ def test_smoothing(): "col_3": ["a", "a", "a", "a", "b"], } ) + X.ww.init(logical_types={"col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(smoothing=1) encoder.fit(X, y) diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py index b5d8453952..4a5d66929c 100644 --- a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py +++ b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py @@ -236,14 +236,14 @@ def test_invalid_target_data_input_formats(): assert messages == expected # test list - y = [None, None, None, 0] + y = [np.nan, np.nan, np.nan, 0] X = pd.DataFrame({"col": range(len(y))}) messages = invalid_targets_check.validate(X, y) assert messages == expected # test np.array - y = np.array([None, None, None, 0]) + y = np.array([np.nan, np.nan, np.nan, 0]) X = pd.DataFrame({"col": range(len(y))}) messages = invalid_targets_check.validate(X, y) From 708ea7bf8194c5352c3434b84fe6c4169e4f77da Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 27 Aug 2021 12:22:16 +0100 Subject: [PATCH 11/36] one hot encoder updates --- .../component_tests/test_one_hot_encoder.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index b5568980f1..24c0f0ba8e 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -166,6 +166,7 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) + X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -216,6 +217,7 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", "a", "a", "a"], } ) + 
From 708ea7bf8194c5352c3434b84fe6c4169e4f77da Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:22:16 +0100
Subject: [PATCH 11/36] one hot encoder updates

---
 .../component_tests/test_one_hot_encoder.py | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
index b5568980f1..24c0f0ba8e 100644
--- a/evalml/tests/component_tests/test_one_hot_encoder.py
+++ b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -166,6 +166,7 @@ def test_drop_first():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -216,6 +217,7 @@ def test_drop_binary_and_top_n_2():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=2, drop="if_binary")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -233,7 +235,7 @@ def test_handle_unknown():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(handle_unknown="error")
     encoder.fit(X)
     assert isinstance(encoder.transform(X), pd.DataFrame)
@@ -299,6 +301,7 @@ def test_categories():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
 
     categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]
 
@@ -373,6 +376,7 @@ def test_more_top_n_unique_values():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
 
     random_seed = 2
 
@@ -521,21 +525,24 @@ def test_large_number_of_categories():
 @pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"])
 def test_data_types(data_type):
     if data_type == "list":
-        X = [["a"], ["b"], ["c"]]
+        X = [["a"], ["b"], ["c"]]*5
     elif data_type == "np":
-        X = np.array([["a"], ["b"], ["c"]])
+        X = np.array([["a"], ["b"], ["c"]]*5)
     elif data_type == "pd_no_index":
-        X = pd.DataFrame(["a", "b", "c"])
+        X = pd.DataFrame(["a", "b", "c"]*5)
     elif data_type == "pd_index":
-        X = pd.DataFrame(["a", "b", "c"], columns=["0"])
+        X = pd.DataFrame(["a", "b", "c"]*5, columns=["0"])
     elif data_type == "ww":
-        X = pd.DataFrame(["a", "b", "c"])
+        X = pd.DataFrame(["a", "b", "c"]*5)
         X.ww.init()
     encoder = OneHotEncoder()
     encoder.fit(X)
     X_t = encoder.transform(X)
     assert list(X_t.columns) == ["0_a", "0_b", "0_c"]
-    np.testing.assert_array_equal(X_t.to_numpy(), np.identity(3))
+    mask = np.identity(3)
+    for _ in range(4):
+        mask = np.vstack((mask, np.identity(3)))
+    np.testing.assert_array_equal(X_t.to_numpy(), mask)
 
 
 @pytest.mark.parametrize(
@@ -563,6 +570,7 @@ def test_ohe_categories():
     X = pd.DataFrame(
         {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}
     )
+    X.ww.init(logical_types={"col_2": "categorical"})
     ohe = OneHotEncoder(top_n=2)
     with pytest.raises(
         ComponentNotYetFittedError,
@@ -584,6 +592,7 @@ def test_ohe_get_feature_names():
     X = pd.DataFrame(
         {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}
     )
+    X.ww.init(logical_types={"col_2": "categorical"})
     ohe = OneHotEncoder(top_n=2)
     with pytest.raises(
         ComponentNotYetFittedError,
@@ -671,6 +680,7 @@ def check_df_equality(random_seed):
 
 def test_ohe_column_names_unique():
     df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]})
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"})
     df_transformed = OneHotEncoder().fit_transform(df)
     assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1"}
 
@@ -685,6 +695,7 @@ def test_ohe_column_names_unique():
             "A_x_y": ["1", "y", "y"],
         }
     )
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"})
     df_transformed = OneHotEncoder().fit_transform(df)
     # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists
     # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists
@@ -693,6 +704,7 @@ def test_ohe_column_names_unique():
     df = pd.DataFrame(
         {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]}
     )
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"})
    df_transformed = OneHotEncoder().fit_transform(df)
     # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists
     # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists

From 63eb23a44f5e0a19d5623c69b8057a6a70779d81 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:32:11 +0100
Subject: [PATCH 12/36] more ohe

---
 evalml/tests/component_tests/test_one_hot_encoder.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
index 24c0f0ba8e..076442342f 100644
--- a/evalml/tests/component_tests/test_one_hot_encoder.py
+++ b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -78,7 +78,7 @@ def test_null_values_in_dataframe():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     # Test NaN will be counted as a category if within the top_n
     encoder = OneHotEncoder(handle_missing="as_category")
     encoder.fit(X)
@@ -110,7 +110,7 @@ def test_null_values_in_dataframe():
             "col_4": [2, 0, 1, np.nan, 0],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     encoder = OneHotEncoder(top_n=2, handle_missing="as_category")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -183,6 +183,7 @@ def test_drop_binary():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -199,6 +200,7 @@ def test_drop_parameter_is_array():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -263,6 +265,7 @@ def test_no_top_n():
             "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     expected_col_names = set(["col_3_b", "col_4"])
     for val in X["col_1"]:
         expected_col_names.add("col_1_" + val)
@@ -345,7 +348,7 @@ def test_less_than_top_n_unique_values():
             "col_4": [2, 0, 1, 0, 0],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     encoder = OneHotEncoder(top_n=5)
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -419,7 +422,7 @@ def test_more_top_n_unique_values_large():
             "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     random_seed = 2
 
     encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
@@ -455,6 +458,7 @@ def test_categorical_dtype():
         }
     )
     X["col_4"] = X["col_4"].astype("category")
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
 
     encoder = OneHotEncoder(top_n=5)
     encoder.fit(X)
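Note: these tests pin their string columns as categorical up front; with Woodwork 0.6.0 they would otherwise infer as Unknown and bypass the encoder's category handling (an assumption based on the pattern of these edits). The deduplicated-name behavior asserted above can be reproduced directly:

    import pandas as pd
    from evalml.pipelines.components import OneHotEncoder

    df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]})
    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"})
    # 'A_x' + 'y' collides with 'A' + 'x_y', so a suffix is appended.
    print(set(OneHotEncoder().fit_transform(df).columns))  # {'A_x_y', 'A_x_y_1'}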
From 86ca4524f1e0a8dd8fbf8aecb6b36bc204e9c036 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:54:28 +0100
Subject: [PATCH 13/36] segmentation fault

---
 evalml/tests/component_tests/test_lgbm_classifier.py | 2 ++
 evalml/tests/component_tests/test_lgbm_regressor.py  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index bd6dd0ba41..5db4d88d82 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -175,6 +175,7 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X = pd.DataFrame(
         {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2}
     )
+    X.ww.init(logical_types={"feature_2": "categorical"})
     y = pd.Series([1, 1, 0, 0, 0, 1]*2)
     X_expected = pd.DataFrame(
         {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2}
@@ -182,6 +183,7 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")
 
     X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
+    X_subset.ww.init(logical_types={"feature_2": "categorical"})
     X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
     X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category")

diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py
index 1b20b23bd4..1390cd1e0d 100644
--- a/evalml/tests/component_tests/test_lgbm_regressor.py
+++ b/evalml/tests/component_tests/test_lgbm_regressor.py
@@ -123,6 +123,7 @@ def test_categorical_data_subset(mock_predict, X_y_regression):
     X = pd.DataFrame(
         {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}
     )
+    X.ww.init(logical_types={"feature_2": "categorical"})
     y = pd.Series([1, 1, 0, 0, 0, 1])
     X_expected = pd.DataFrame(
         {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}
@@ -130,6 +131,7 @@ def test_categorical_data_subset(mock_predict, X_y_regression):
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")
 
     X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
+    X_subset.ww.init(logical_types={"feature_2": "categorical"})
     X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
     X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category")
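Note: "segmentation fault" here presumably refers to LightGBM crashing when the fit and predict frames disagree about category mappings; initializing both frames with the same Categorical logical type keeps the ordinal codes consistent. The alignment the subset tests rely on, in plain pandas:

    import pandas as pd

    X_fit = pd.DataFrame({"feature_2": ["a", "a", "b", "b", "c", "c"]})
    X_subset = pd.DataFrame({"feature_2": ["c", "a"]})
    # Reuse the categories seen at fit time so the subset maps to the
    # same codes (matching X_expected_subset's [2.0, 0.0]).
    cats = X_fit["feature_2"].astype("category").cat.categories
    aligned = X_subset["feature_2"].astype(pd.CategoricalDtype(categories=cats))
    print(aligned.cat.codes.tolist())  # [2, 0]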
From 7871c74c811953e70e7bcd2b761e3f94dbb41df6 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 15:27:17 +0100
Subject: [PATCH 14/36] lgbm, per column, simple imputer

---
 evalml/tests/component_tests/test_lgbm_classifier.py  |  4 ++++
 .../tests/component_tests/test_per_column_imputer.py  |  5 +++--
 evalml/tests/component_tests/test_simple_imputer.py   | 12 ++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index 5db4d88d82..052211823c 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -205,7 +205,9 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
 def test_multiple_fit(mock_predict, mock_predict_proba):
     y = pd.Series([1] * 4)
     X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]})
+    X1_fit.ww.init(logical_types={"feature": "categorical"})
     X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]})
+    X1_predict.ww.init(logical_types={"feature": "categorical"})
     X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category")
 
     clf = LightGBMClassifier()
@@ -217,7 +219,9 @@ def test_multiple_fit(mock_predict, mock_predict_proba):
 
     # Check if it will fit a different dataset with new variable
     X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]})
+    X2_fit.ww.init(logical_types={"feature": "categorical"})
     X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]})
+    X2_predict.ww.init(logical_types={"feature": "categorical"})
     X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category")
 
     clf = LightGBMClassifier()

diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py
index 7dcdbdc690..88988b50e9 100644
--- a/evalml/tests/component_tests/test_per_column_imputer.py
+++ b/evalml/tests/component_tests/test_per_column_imputer.py
@@ -47,6 +47,7 @@ def test_all_strategies():
             "D": pd.Series(["a", "a", "b", np.nan]),
         }
     )
+    X.ww.init(logical_types={"D": "categorical"})
 
     X_expected = pd.DataFrame(
         {
@@ -91,7 +92,7 @@ def test_fit_transform():
 def test_non_numeric_errors(non_numeric_df):
     # test col with all strings
     X = non_numeric_df
-
+    X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"})
     # mean with all strings
     strategies = {"A": {"impute_strategy": "mean"}}
     with pytest.raises(
@@ -121,7 +122,7 @@ def test_non_numeric_errors(non_numeric_df):
 
 def test_non_numeric_valid(non_numeric_df):
     X = non_numeric_df
-
+    X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"})
     # most frequent with all strings
     strategies = {"C": {"impute_strategy": "most_frequent"}}
     transformer = PerColumnImputer(impute_strategies=strategies)

diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index 0298d97357..cdd83af90e 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -44,7 +44,7 @@ def test_simple_imputer_mean():
 def test_simple_imputer_constant():
     # test impute strategy is constant and fill value is not specified
     X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 3, 0]])
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="constant", fill_value=3)
     X_expected_arr = pd.DataFrame([[3, 0, 1, 3], ["a", 2, 3, 3], ["b", 2, 3, 0]])
     X_expected_arr = X_expected_arr.astype({0: "category"})
@@ -54,7 +54,7 @@ def test_simple_imputer_constant():
 
 def test_simple_imputer_most_frequent():
     X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 1, 0]])
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="most_frequent")
     X_expected_arr = pd.DataFrame([["a", 0, 1, 0], ["a", 2, 1, 3], ["b", 2, 1, 0]])
     X_expected_arr = X_expected_arr.astype({0: "category"})
@@ -67,7 +67,7 @@ def test_simple_imputer_col_with_non_numeric():
     X = pd.DataFrame(
         [["a", 0, 1, np.nan], ["b", 2, 3, 3], ["a", 2, 3, 1], [np.nan, 2, 3, 0]]
     )
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="mean")
     with pytest.raises(
         ValueError, match="Cannot use mean strategy with non-numeric data"
@@ -121,6 +121,7 @@ def test_simple_imputer_all_bool_return_original(data_type, make_data_type):
 @pytest.mark.parametrize("data_type", ["pd", "ww"])
 def test_simple_imputer_boolean_dtype(data_type, make_data_type):
     X = pd.DataFrame([True, np.nan, False, np.nan, True])
+    X.ww.init(logical_types={0: "categorical"})
     y = pd.Series([1, 0, 0, 1, 0])
     X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category")
     X = make_data_type(data_type, X)
@@ -138,6 +139,7 @@ def test_simple_imputer_multitype_with_one_bool(data_type, make_data_type):
             "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
         }
     )
+    X_multi.ww.init(logical_types={"bool with nan": "categorical"})
     y = pd.Series([1, 0, 0, 1, 0])
     X_multi_expected_arr = pd.DataFrame(
         {
@@ -256,6 +258,7 @@ def test_simple_imputer_fill_value(data_type):
             ),
         }
     )
+    X.ww.init(logical_types={"categorical with nan": "categorical", "object with nan": "categorical"})
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
     imputer.fit(X, y)
@@ -321,6 +324,7 @@ def test_simple_imputer_with_none():
             "all None": [None, None, None, None],
         }
     )
+    X.ww.init(logical_types={"boolean with None": "categorical", "object with None": "categorical", "all None": "categorical"})
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = SimpleImputer()
     imputer.fit(X, y)
@@ -343,7 +347,7 @@ def test_simple_imputer_supports_natural_language_constant():
         }
     )
     y = pd.Series([0, 0, 1, 0, 1])
-    X.ww.init(logical_types={"natural language col": "NaturalLanguage"})
+    X.ww.init(logical_types={"cat with None": "categorical", "natural language col": "NaturalLanguage"})
     imputer = SimpleImputer(impute_strategy="constant", fill_value="placeholder")
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)

From c336f9d1a065578ddb4b99f0516d93a5cd2b0a45 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Mon, 30 Aug 2021 11:14:07 +0100
Subject: [PATCH 15/36] imputer and partial dependence

---
 evalml/tests/component_tests/test_imputer.py                    | 3 ++-
 evalml/tests/conftest.py                                        | 2 ++
 .../tests/model_understanding_tests/test_partial_dependence.py  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
index cd75b4044a..28d4ebcd6e 100644
--- a/evalml/tests/component_tests/test_imputer.py
+++ b/evalml/tests/component_tests/test_imputer.py
@@ -259,9 +259,10 @@ def test_imputer_does_not_reset_index():
     X.loc[5, "input_val"] = np.nan
     X.loc[5, "input_cat"] = np.nan
     assert X.index.tolist() == list(range(10))
+    X.ww.init(logical_types={"input_cat": "categorical"})
 
     X.drop(0, inplace=True)
-    y = X.pop("target")
+    y = X.ww.pop("target")
     imputer = Imputer()
     imputer.fit(X, y=y)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index b80661d0bf..2572dc97fb 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -43,6 +43,7 @@
     handle_problem_types,
     is_regression,
 )
+from evalml.utils import infer_feature_types
 
 
 def pytest_configure(config):
@@ -741,6 +742,7 @@ def decision_tree_classification_pipeline_class(X_y_categorical_classification):
         }
     )
     X, y = X_y_categorical_classification
+    X.ww.init(logical_types={"Ticket": "categorical", "Cabin": "categorical"})
     pipeline.fit(X, y)
     return pipeline

diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index 6751b29a30..30b33783f9 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -21,6 +21,7 @@
     RegressionPipeline,
 )
 from evalml.problem_types import ProblemTypes
+from evalml.utils import infer_feature_types
 
 
 @pytest.fixture
@@ -209,6 +210,7 @@ def test_partial_dependence_catboost(
             "also string": ["c", "b", "a"],
         }
     )
+    X.ww.init(logical_types={"string": "categorical", "also string": "categorical"})
     pipeline = pipeline_class(
         component_graph=["CatBoost Classifier"],
         parameters={"CatBoost Classifier": {"thread_count": 1}},
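Note: infer_feature_types, imported above, is evalml's wrapper around Woodwork initialization and accepts per-column overrides; it is how the model-understanding tests pin their categoricals. A quick usage sketch:

    import pandas as pd
    from evalml.utils import infer_feature_types

    X = pd.DataFrame({"string": ["a", "b", "c"] * 5})
    # Override inference for one column, leave the rest inferred.
    X = infer_feature_types(X, {"string": "categorical"})
    print(X.ww.logical_types)  # string -> Categorical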
From 1fd623133dd9b9fd57922428cf5b9c00e45aca6e Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:27:11 +0100 Subject: [PATCH 16/36] pip install scikit-learn --- .github/workflows/windows_unit_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index b5c0eba6f1..041827d3d6 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -66,6 +66,7 @@ jobs: conda activate curr_py python -m pip install --upgrade pip python -m pip install . + python -m pip install scikit-learn python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From b086a1628d35ab45d1c9a520a81a356fd6d3d9bf Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:34:54 +0100 Subject: [PATCH 17/36] install woodwork --- .github/workflows/windows_unit_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index 041827d3d6..ccbe5231c6 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -67,6 +67,7 @@ jobs: python -m pip install --upgrade pip python -m pip install . python -m pip install scikit-learn + python -m pip install woodwork==0.5.1 python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From 370f3372dbbc152b1d6839059b6cabdf3c9dde26 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:41:47 +0100 Subject: [PATCH 18/36] Remove temporary scikit-learn and woodwork installs --- .github/workflows/windows_unit_tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index ccbe5231c6..b5c0eba6f1 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -66,8 +66,6 @@ jobs: conda activate curr_py python -m pip install --upgrade pip python -m pip install .
- python -m pip install scikit-learn - python -m pip install woodwork==0.5.1 python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From 64868afa83c46dfce8847a21f310ba75bcb96089 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 12:14:29 +0100 Subject: [PATCH 19/36] test_explainers --- .../prediction_explanations_tests/test_explainers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 2405e2cabd..9f847ca582 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1627,6 +1627,7 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], From 6b330649ec6be61ce1f0325c58514246b5c994be Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 12:34:56 +0100 Subject: [PATCH 20/36] plotly update --- .../dependency_update_check/latest_dependency_versions.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 9fe6b759ae..47683d73c1 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -16,7 +16,7 @@ nlp-primitives==1.1.0 numba==0.53.0 numpy==1.21.2 pandas==1.3.2 -plotly==5.2.2 +plotly==5.3.0 pmdarima==1.8.0 psutil==5.8.0 pyzmq==22.2.1 From d56a5caca34aecfb7a9daa292cdbc8223c4cdaa3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:10:19 +0100 Subject: [PATCH 21/36] partial dependence --- .../test_partial_dependence.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 30b33783f9..8af3f12b70 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -185,11 +185,11 @@ def test_partial_dependence_catboost( if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - y_small = ["a", "b", "a"] + y_small = ["a", "b", "a"]*5 pipeline_class = BinaryClassificationPipeline else: X, y = X_y_multi - y_small = ["a", "b", "c"] + y_small = ["a", "b", "c"]*5 pipeline_class = MulticlassClassificationPipeline pipeline = pipeline_class( @@ -204,13 +204,12 @@ def test_partial_dependence_catboost( # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence X = pd.DataFrame( { - "numeric": [1, 2, 3], - "also numeric": [2, 3, 4], - "string": ["a", "b", "c"], - "also string": ["c", "b", "a"], + "numeric": [1, 2, 3]*5, + "also numeric": [2, 3, 4]*5, + "string": ["a", "b", "c"]*5, + "also string": ["c", "b", "a"]*5, } ) - X.ww.init(logical_types={"string": "categorical", "also string": "categorical"}) pipeline = pipeline_class( component_graph=["CatBoost Classifier"], parameters={"CatBoost Classifier": 
{"thread_count": 1}}, From 77374a9795f1ffc0df8a26c648468a4770dc6367 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:14:53 +0100 Subject: [PATCH 22/36] lint fixes --- evalml/tests/component_tests/test_imputer.py | 150 ++++++++++-------- .../component_tests/test_lgbm_classifier.py | 9 +- .../component_tests/test_one_hot_encoder.py | 82 ++++++++-- .../test_per_column_imputer.py | 18 ++- .../component_tests/test_simple_imputer.py | 22 ++- .../component_tests/test_target_encoder.py | 22 ++- .../component_tests/test_target_imputer.py | 24 +-- .../data_checks_tests/test_data_checks.py | 7 +- .../test_id_columns_data_check.py | 9 +- .../test_multicollinearity_data_check.py | 10 +- .../test_explainers.py | 32 +++- .../test_partial_dependence.py | 22 +-- .../test_permutation_importance.py | 4 +- .../test_regression.py | 9 +- .../pipeline_tests/test_component_graph.py | 15 +- .../pipeline_tests/test_pipeline_utils.py | 22 +-- 16 files changed, 311 insertions(+), 146 deletions(-) diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 28d4ebcd6e..608f62c02b 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -19,24 +19,24 @@ def imputer_test_data(): return pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "int col": [0, 1, 2, 0, 3]*4, - "object col": ["b", "b", "a", "c", "d"]*4, - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "bool col": [True, False, False, True, True]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "object col": ["b", "b", "a", "c", "d"] * 4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - [np.nan, "1", "0", "0", "3"]*4, dtype="category" + [np.nan, "1", "0", "0", "3"] * 4, dtype="category" ), - "int with nan": [np.nan, 1, 0, 0, 1]*4, - "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0]*4, - "object with nan": ["b", "b", np.nan, "c", np.nan]*4, + "int with nan": [np.nan, 1, 0, 0, 1] * 4, + "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0] * 4, + "object with nan": ["b", "b", np.nan, "c", np.nan] * 4, "bool col with nan": pd.Series( - [True, np.nan, False, np.nan, True]*4, dtype="category" + [True, np.nan, False, np.nan, True] * 4, dtype="category" ), - "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]*4, + "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan] * 4, "all nan cat": pd.Series( - [np.nan, np.nan, np.nan, np.nan, np.nan]*4, dtype="category" + [np.nan, np.nan, np.nan, np.nan, np.nan] * 4, dtype="category" ), } ) @@ -91,16 +91,16 @@ def test_numeric_only_input(imputer_test_data): X = imputer_test_data[ ["int col", "float col", "int with nan", "float with nan", "all nan"] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer(numeric_impute_strategy="median") imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int col": [0, 1, 2, 0, 3]*4, - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, - "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0] * 4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0] * 4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -122,21 +122,23 @@ def 
test_categorical_only_input(imputer_test_data): "all nan cat", ] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "bool col": [True, False, False, True, True]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"]*4, dtype="category" + ["0", "1", "0", "0", "3"] * 4, dtype="category" + ), + "object with nan": pd.Series( + ["b", "b", "b", "c", "b"] * 4, dtype="category" ), - "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True]*4, dtype="category" + [True, True, False, True, True] * 4, dtype="category" ), } ) @@ -157,20 +159,22 @@ def test_categorical_and_numeric_input(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "int col": [0, 1, 2, 0, 3]*4, - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "bool col": [True, False, False, True, True]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"]*4, dtype="category" + ["0", "1", "0", "0", "3"] * 4, dtype="category" + ), + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0] * 4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0] * 4, + "object with nan": pd.Series( + ["b", "b", "b", "c", "b"] * 4, dtype="category" ), - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, - "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, - "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True]*4, dtype="category" + [True, True, False, True, True] * 4, dtype="category" ), } ) @@ -183,7 +187,7 @@ def test_categorical_and_numeric_input(imputer_test_data): def test_drop_all_columns(imputer_test_data): X = imputer_test_data[["all nan cat", "all nan"]] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) X.ww.init() imputer = Imputer() imputer.fit(X, y) @@ -289,7 +293,7 @@ def test_imputer_fill_value(imputer_test_data): "bool col with nan", ] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -300,16 +304,16 @@ def test_imputer_fill_value(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with nan": [-1, 1, 0, 0, 1]*4, + "int with nan": [-1, 1, 0, 0, 1] * 4, "categorical with nan": pd.Series( - ["fill", "1", "0", "0", "3"]*4, dtype="category" + ["fill", "1", "0", "0", "3"] * 4, dtype="category" ), - "float with nan": [0.0, 1.0, -1, -1.0, 0.0]*4, + "float with nan": [0.0, 1.0, -1, -1.0, 0.0] * 4, "object with nan": pd.Series( - ["b", "b", "fill", "c", "fill"]*4, dtype="category" + ["b", "b", "fill", "c", "fill"] * 4, dtype="category" ), "bool col with nan": pd.Series( - [True, "fill", False, 
"fill", True]*4, dtype="category" + [True, "fill", False, "fill", True] * 4, dtype="category" ), } ) @@ -327,7 +331,7 @@ def test_imputer_fill_value(imputer_test_data): def test_imputer_no_nans(imputer_test_data): X = imputer_test_data[["categorical col", "object col", "bool col"]] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -339,10 +343,10 @@ def test_imputer_no_nans(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "bool col": [True, False, False, True, True]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "bool col": [True, False, False, True, True] * 4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -360,25 +364,29 @@ def test_imputer_no_nans(imputer_test_data): def test_imputer_with_none(): X = pd.DataFrame( { - "int with None": [1, 0, 5, None]*4, - "float with None": [0.1, 0.0, 0.5, None]*4, - "category with None": pd.Series(["b", "a", "a", None]*4, dtype="category"), - "boolean with None": pd.Series([True, None, False, True]*4), - "object with None": ["b", "a", "a", None]*4, - "all None": [None, None, None, None]*4, + "int with None": [1, 0, 5, None] * 4, + "float with None": [0.1, 0.0, 0.5, None] * 4, + "category with None": pd.Series( + ["b", "a", "a", None] * 4, dtype="category" + ), + "boolean with None": pd.Series([True, None, False, True] * 4), + "object with None": ["b", "a", "a", None] * 4, + "all None": [None, None, None, None] * 4, } ) - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with None": [1, 0, 5, 2]*4, - "float with None": [0.1, 0.0, 0.5, 0.2]*4, - "category with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), - "boolean with None": pd.Series([True, True, False, True]*4, dtype="category"), - "object with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + "int with None": [1, 0, 5, 2] * 4, + "float with None": [0.1, 0.0, 0.5, 0.2] * 4, + "category with None": pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), + "boolean with None": pd.Series( + [True, True, False, True] * 4, dtype="category" + ), + "object with None": pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), } ) assert_frame_equal(expected, transformed, check_dtype=False) @@ -404,9 +412,9 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): @pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_bool_dtype_object(data_type, make_data_type): - X = pd.DataFrame([True, np.nan, False, np.nan, True]*4) - y = pd.Series([1, 0, 0, 1, 0]*4) - X_expected_arr = pd.DataFrame([True, True, False, True, True]*4, dtype="category") + X = pd.DataFrame([True, np.nan, False, np.nan, True] * 4) + y = pd.Series([1, 0, 0, 1, 0] * 4) + X_expected_arr = pd.DataFrame([True, True, False, True, True] * 4, dtype="category") X = make_data_type(data_type, X) y = make_data_type(data_type, y) imputer = Imputer() @@ -419,17 +427,21 @@ def test_imputer_bool_dtype_object(data_type, make_data_type): def test_imputer_multitype_with_one_bool(data_type, make_data_type): X_multi = pd.DataFrame( { - "bool with nan": pd.Series([True, np.nan, False, 
np.nan, False]*4), - "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), + "bool with nan": pd.Series([True, np.nan, False, np.nan, False] * 4), + "bool no nan": pd.Series( + [False, False, False, False, True] * 4, dtype=bool + ), } ) - y = pd.Series([1, 0, 0, 1, 0]*4) + y = pd.Series([1, 0, 0, 1, 0] * 4) X_multi_expected_arr = pd.DataFrame( { "bool with nan": pd.Series( - [True, False, False, False, False]*4, dtype="category" + [True, False, False, False, False] * 4, dtype="category" + ), + "bool no nan": pd.Series( + [False, False, False, False, True] * 4, dtype=bool ), - "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) @@ -469,23 +481,23 @@ def test_imputer_int_preserved(): def test_imputer_bool_preserved(): - X = pd.DataFrame(pd.Series([True, False, True, np.nan]*4)) + X = pd.DataFrame(pd.Series([True, False, True, np.nan] * 4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, True]*4, dtype="category")), + pd.DataFrame(pd.Series([True, False, True, True] * 4, dtype="category")), ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { 0: Categorical } - X = pd.DataFrame(pd.Series([True, False, True, False]*4)) + X = pd.DataFrame(pd.Series([True, False, True, False] * 4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, False]*4)), + pd.DataFrame(pd.Series([True, False, True, False] * 4)), check_dtype=False, ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Boolean} diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 052211823c..2a35d0ba7c 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -173,12 +173,15 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): X = pd.DataFrame( - {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2} + { + "feature_1": [0, 0, 1, 1, 0, 1] * 2, + "feature_2": ["a", "a", "b", "b", "c", "c"] * 2, + } ) X.ww.init(logical_types={"feature_2": "categorical"}) - y = pd.Series([1, 1, 0, 0, 0, 1]*2) + y = pd.Series([1, 1, 0, 0, 0, 1] * 2) X_expected = pd.DataFrame( - {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2} + {0: [0, 0, 1, 1, 0, 1] * 2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0] * 2} ) X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 076442342f..36a5cad415 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -166,7 +166,13 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = 
encoder.transform(X) @@ -183,7 +189,13 @@ def test_drop_binary(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -200,7 +212,13 @@ def test_drop_parameter_is_array(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -219,7 +237,13 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=2, drop="if_binary") encoder.fit(X) X_t = encoder.transform(X) @@ -237,7 +261,13 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(handle_unknown="error") encoder.fit(X) assert isinstance(encoder.transform(X), pd.DataFrame) @@ -304,7 +334,13 @@ def test_categories(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] @@ -379,7 +415,13 @@ def test_more_top_n_unique_values(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) random_seed = 2 @@ -422,7 +464,13 @@ def test_more_top_n_unique_values_large(): "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) random_seed = 2 encoder = OneHotEncoder(top_n=3, random_seed=random_seed) @@ -529,15 +577,15 @@ def test_large_number_of_categories(): @pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"]) def test_data_types(data_type): if data_type == "list": - X = [["a"], ["b"], ["c"]]*5 + X = [["a"], ["b"], ["c"]] * 5 elif data_type == "np": - X = np.array([["a"], ["b"], ["c"]]*5) + X = np.array([["a"], ["b"], ["c"]] * 5) elif data_type == "pd_no_index": - X = pd.DataFrame(["a", "b", "c"]*5) + X = pd.DataFrame(["a", "b", "c"] * 5) elif data_type == "pd_index": - X = pd.DataFrame(["a", "b", "c"]*5, columns=["0"]) + X = pd.DataFrame(["a", "b", "c"] * 5, columns=["0"]) elif data_type == "ww": - X = pd.DataFrame(["a", "b", "c"]*5) + X = pd.DataFrame(["a", "b", "c"] * 5) X.ww.init() encoder = OneHotEncoder() 
encoder.fit(X) @@ -699,7 +747,9 @@ def test_ohe_column_names_unique(): "A_x_y": ["1", "y", "y"], } ) - df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"}) + df.ww.init( + logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"} + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists @@ -708,7 +758,9 @@ def test_ohe_column_names_unique(): df = pd.DataFrame( {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]} ) - df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"}) + df.ww.init( + logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"} + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py index 88988b50e9..f97390f5d3 100644 --- a/evalml/tests/component_tests/test_per_column_imputer.py +++ b/evalml/tests/component_tests/test_per_column_imputer.py @@ -92,7 +92,14 @@ def test_fit_transform(): def test_non_numeric_errors(non_numeric_df): # test col with all strings X = non_numeric_df - X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"}) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) # mean with all strings strategies = {"A": {"impute_strategy": "mean"}} with pytest.raises( @@ -122,7 +129,14 @@ def test_non_numeric_errors(non_numeric_df): def test_non_numeric_valid(non_numeric_df): X = non_numeric_df - X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"}) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) # most frequent with all strings strategies = {"C": {"impute_strategy": "most_frequent"}} transformer = PerColumnImputer(impute_strategies=strategies) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index cdd83af90e..7d0de8c978 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -258,7 +258,12 @@ def test_simple_imputer_fill_value(data_type): ), } ) - X.ww.init(logical_types={"categorical with nan": "categorical", "object with nan": "categorical"}) + X.ww.init( + logical_types={ + "categorical with nan": "categorical", + "object with nan": "categorical", + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value) imputer.fit(X, y) @@ -324,7 +329,13 @@ def test_simple_imputer_with_none(): "all None": [None, None, None, None], } ) - X.ww.init(logical_types={"boolean with None": "categorical", "object with None": "categorical", "all None": "categorical"}) + X.ww.init( + logical_types={ + "boolean with None": "categorical", + "object with None": "categorical", + "all None": "categorical", + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer() imputer.fit(X, y) @@ -347,7 +358,12 @@ def test_simple_imputer_supports_natural_language_constant(): } ) y = pd.Series([0, 0, 
1, 0, 1]) - X.ww.init(logical_types={"cat with None": "categorical", "natural language col": "NaturalLanguage"}) + X.ww.init( + logical_types={ + "cat with None": "categorical", + "natural language col": "NaturalLanguage", + } + ) imputer = SimpleImputer(impute_strategy="constant", fill_value="placeholder") imputer.fit(X, y) transformed = imputer.transform(X, y) diff --git a/evalml/tests/component_tests/test_target_encoder.py b/evalml/tests/component_tests/test_target_encoder.py index fd7c5315e1..9d497a8c61 100644 --- a/evalml/tests/component_tests/test_target_encoder.py +++ b/evalml/tests/component_tests/test_target_encoder.py @@ -70,7 +70,13 @@ def test_null_values_in_dataframe(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(handle_missing="value") encoder.fit(X, y) @@ -117,13 +123,13 @@ def test_null_values_in_dataframe(): def test_cols(): X = pd.DataFrame( { - "col_1": [1, 2, 1, 1, 2]*2, - "col_2": ["2", "1", "1", "1", "1"]*2, - "col_3": ["a", "a", "a", "a", "a"]*2, + "col_1": [1, 2, 1, 1, 2] * 2, + "col_2": ["2", "1", "1", "1", "1"] * 2, + "col_3": ["a", "a", "a", "a", "a"] * 2, } ) X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"}) - y = pd.Series([0, 1, 1, 1, 0]*2) + y = pd.Series([0, 1, 1, 1, 0] * 2) encoder = TargetEncoder(cols=[]) encoder.fit(X, y) X_t = encoder.transform(X) @@ -134,9 +140,9 @@ def test_cols(): X_t = encoder.transform(X) X_expected = pd.DataFrame( { - "col_1": pd.Series([1, 2, 1, 1, 2]*2, dtype="int64"), - "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863]*2, - "col_3": pd.Series(["a", "a", "a", "a", "a"]*2, dtype="category"), + "col_1": pd.Series([1, 2, 1, 1, 2] * 2, dtype="int64"), + "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863] * 2, + "col_3": pd.Series(["a", "a", "a", "a", "a"] * 2, dtype="category"), } ) assert_frame_equal(X_expected, X_t, check_less_precise=True) diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py index 8a9ad5ce26..9bdf615b66 100644 --- a/evalml/tests/component_tests/test_target_imputer.py +++ b/evalml/tests/component_tests/test_target_imputer.py @@ -57,11 +57,15 @@ def test_target_imputer_mean(): (None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])), ( None, - pd.Series([np.nan, "a", "b"]*5), - pd.Series(["missing_value", "a", "b"]*5).astype("category"), + pd.Series([np.nan, "a", "b"] * 5), + pd.Series(["missing_value", "a", "b"] * 5).astype("category"), ), (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])), - (3, pd.Series([np.nan, "a", "b"]*5), pd.Series([3, "a", "b"]*5).astype("category")), + ( + 3, + pd.Series([np.nan, "a", "b"] * 5), + pd.Series([3, "a", "b"] * 5).astype("category"), + ), ], ) def test_target_imputer_constant(fill_value, y, y_expected): @@ -71,9 +75,9 @@ def test_target_imputer_constant(fill_value, y, y_expected): def test_target_imputer_most_frequent(): - y = pd.Series([np.nan, "a", "b"]*5) + y = pd.Series([np.nan, "a", "b"] * 5) imputer = TargetImputer(impute_strategy="most_frequent") - y_expected = pd.Series(["a", "a", "b"]*5).astype("category") + y_expected = pd.Series(["a", "a", "b"] * 5).astype("category") _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) @@ -85,7 +89,7 @@ def 
test_target_imputer_most_frequent(): def test_target_imputer_col_with_non_numeric_with_numeric_strategy(): - y = pd.Series([np.nan, "a", "b"]*5) + y = pd.Series([np.nan, "a", "b"] * 5) imputer = TargetImputer(impute_strategy="mean") with pytest.raises( ValueError, match="Cannot use mean strategy with non-numeric data" @@ -190,16 +194,16 @@ def test_target_imputer_with_none(y, y_expected): "y, y_expected", [ ( - pd.Series(["b", "a", "a", None]*4, dtype="category"), - pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + pd.Series(["b", "a", "a", None] * 4, dtype="category"), + pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), ), ( pd.Series([True, None, False, True], dtype="category"), pd.Series([True, True, False, True], dtype="category"), ), ( - pd.Series(["b", "a", "a", None]*4), - pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + pd.Series(["b", "a", "a", None] * 4), + pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), ), ], ) diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 529fc66d68..fc1179cc81 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -340,7 +340,12 @@ def test_default_data_checks_regression(input_type): X["nan_dt_col"][0] = None y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2]) y_no_variance = pd.Series([5] * 5) - X.ww.init(logical_types={"lots_of_null": "categorical", "natural_language_nan": "NaturalLanguage"}) + X.ww.init( + logical_types={ + "lots_of_null": "categorical", + "natural_language_nan": "NaturalLanguage", + } + ) if input_type == "ww": y = ww.init_series(y) y_no_variance = ww.init_series(y_no_variance) diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py index f15a3c6f45..af5195e335 100644 --- a/evalml/tests/data_checks_tests/test_id_columns_data_check.py +++ b/evalml/tests/data_checks_tests/test_id_columns_data_check.py @@ -135,7 +135,14 @@ def test_id_columns_strings(): "col_6": [0.1, 0.2, 0.3, 0.4], } X = pd.DataFrame.from_dict(X_dict) - X.ww.init(logical_types={"col_1_id": "categorical", "col_2": "categorical", "Id": "categorical", "col_5": "categorical"}) + X.ww.init( + logical_types={ + "col_1_id": "categorical", + "col_2": "categorical", + "Id": "categorical", + "col_5": "categorical", + } + ) id_cols_check = IDColumnsDataCheck(id_threshold=0.95) assert id_cols_check.validate(X) == { "warnings": [ diff --git a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py index 23961985e4..208ffe9be3 100644 --- a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py +++ b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py @@ -86,7 +86,15 @@ def test_multicollinearity_nonnumeric_cols(data_type, make_data_type): "col_6": [1, 1, 2, 3, 1], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical", "col_4": "categorical", "col_5": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + "col_4": "categorical", + "col_5": "categorical", + } + ) multi_check = MulticollinearityDataCheck(threshold=0.9) assert multi_check.validate(X) == { diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py 
b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 9f847ca582..6664e630d5 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1263,7 +1263,9 @@ def transform_y_for_problem_type(problem_type, y): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = pipeline_class( component_graph=[ @@ -1305,7 +1307,13 @@ def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.set_types(logical_types={"provider": "NaturalLanguage", "currency": "categorical", "expiration_date": "categorical"}) + X.ww.set_types( + logical_types={ + "provider": "NaturalLanguage", + "currency": "categorical", + "expiration_date": "categorical", + } + ) component_graph = [ "Select Columns Transformer", "One Hot Encoder", @@ -1362,7 +1370,9 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = pipeline_class( component_graph=[ @@ -1417,7 +1427,9 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = { "SelectNumeric": ["Select Columns Transformer", "X", "y"], @@ -1477,7 +1489,9 @@ def test_categories_aggregated_but_not_those_that_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = [ "Select Columns Transformer", @@ -1521,7 +1535,9 @@ def test_categories_aggregated_when_some_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = [ "Select Columns Transformer", @@ -1627,7 +1643,9 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], diff --git 
a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 8af3f12b70..f045dbc776 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -136,15 +136,15 @@ def test_partial_dependence_with_non_numeric_columns( ): X = pd.DataFrame( { - "numeric": [1, 2, 3, 0]*4, - "also numeric": [2, 3, 4, 1]*4, - "string": ["a", "b", "a", "c"]*4, - "also string": ["c", "b", "a", "c"]*4, + "numeric": [1, 2, 3, 0] * 4, + "also numeric": [2, 3, 4, 1] * 4, + "string": ["a", "b", "a", "c"] * 4, + "also string": ["c", "b", "a", "c"] * 4, } ) if data_type == "ww": X.ww.init() - y = [0, 0.2, 1.4, 1]*4 + y = [0, 0.2, 1.4, 1] * 4 pipeline = linear_regression_pipeline_class( parameters={"Linear Regressor": {"n_jobs": 1}} ) @@ -185,11 +185,11 @@ def test_partial_dependence_catboost( if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - y_small = ["a", "b", "a"]*5 + y_small = ["a", "b", "a"] * 5 pipeline_class = BinaryClassificationPipeline else: X, y = X_y_multi - y_small = ["a", "b", "c"]*5 + y_small = ["a", "b", "c"] * 5 pipeline_class = MulticlassClassificationPipeline pipeline = pipeline_class( @@ -204,10 +204,10 @@ def test_partial_dependence_catboost( # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence X = pd.DataFrame( { - "numeric": [1, 2, 3]*5, - "also numeric": [2, 3, 4]*5, - "string": ["a", "b", "c"]*5, - "also string": ["c", "b", "a"]*5, + "numeric": [1, 2, 3] * 5, + "also numeric": [2, 3, 4] * 5, + "string": ["a", "b", "c"] * 5, + "also string": ["c", "b", "a"] * 5, } ) pipeline = pipeline_class( diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index dfbe0f1c35..35ebcd9266 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -310,7 +310,9 @@ def test_fast_permutation_importance_matches_slow_output( "dependency not installed." 
) X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) if pipeline_class == LinearPipelineWithTextFeatures: X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py index cd83f08b7d..88fcb2c6ac 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py @@ -86,9 +86,14 @@ def test_woodwork_regression_pipeline(diabetes_local, linear_regression_pipeline def test_custom_indices(): X = pd.DataFrame( - {"a": ["a", "b", "a", "a", "a", "c", "c", "c"]*3, "b": [0, 1, 1, 1, 1, 1, 0, 1]*3} + { + "a": ["a", "b", "a", "a", "a", "c", "c", "c"] * 3, + "b": [0, 1, 1, 1, 1, 1, 0, 1] * 3, + } + ) + y = pd.Series( + [0, 0, 0, 1, 0, 1, 0, 0] * 3, index=np.random.choice(24, 24, replace=False) ) - y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0]*3, index=np.random.choice(24, 24, replace=False)) x1, x2, y1, y2 = split_data(X, y, problem_type="regression") pipeline = RegressionPipeline( component_graph=["Imputer", "One Hot Encoder", "Linear Regressor"], diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py index 4246d0a028..253e212db3 100644 --- a/evalml/tests/pipeline_tests/test_component_graph.py +++ b/evalml/tests/pipeline_tests/test_component_graph.py @@ -1018,7 +1018,12 @@ def test_component_graph_dataset_with_different_types(): y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) X = infer_feature_types( - X, {"column_1": "categorical", "column_2": "categorical", "column_5": "NaturalLanguage"} + X, + { + "column_1": "categorical", + "column_2": "categorical", + "column_5": "NaturalLanguage", + }, ) component_graph = ComponentGraph(graph) @@ -1266,7 +1271,9 @@ def transform(self, X, y=None): y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) # woodwork would infer this as boolean by default -- convert to a numeric type - X.ww.init(logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"}) + X.ww.init( + logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"} + ) component_graph = ComponentGraph(graph) # we don't have feature type selectors defined yet, so in order for the above graph to work we have to @@ -1336,7 +1343,9 @@ def test_component_graph_types_merge(): X["column_5"] = X["column_4"] X["column_6"] = [42.0] * len(X) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) - X = infer_feature_types(X, {"column_1": "categorical", "column_5": "NaturalLanguage"}) + X = infer_feature_types( + X, {"column_1": "categorical", "column_5": "NaturalLanguage"} + ) component_graph = ComponentGraph(graph) # we don't have feature type selectors defined yet, so in order for the above graph to work we have to diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 0574a30e58..eaf0a448e4 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -48,9 +48,10 @@ def _get_test_data_from_configuration( ): X_all = pd.DataFrame( { - "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]*2, + "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] + * 2, "numerical": range(14), - 
"categorical": ["a", "b", "a", "b", "b", "a", "b"]*2, + "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2, "dates": pd.date_range("2000-02-03", periods=14, freq="W"), "text": [ "this is a string", @@ -60,7 +61,8 @@ def _get_test_data_from_configuration( "cats are gr8", "hello world", "evalml is gr8", - ]*2, + ] + * 2, "email": [ "abalone_0@gmail.com", "AbaloneRings@yahoo.com", @@ -69,7 +71,8 @@ def _get_test_data_from_configuration( "fooEMAIL@email.org", "evalml@evalml.org", "evalml@alteryx.org", - ]*2, + ] + * 2, "url": [ "https://evalml.alteryx.com/en/stable/", "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html", @@ -78,17 +81,18 @@ def _get_test_data_from_configuration( "https://www.evalml.alteryx.com/en/stable/demos/text_input.html", "https://github.com/alteryx/evalml", "https://github.com/alteryx/featuretools", - ]*2, + ] + * 2, } ) - y = pd.Series([0, 0, 1, 0, 0, 1, 1]*2) + y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2) if problem_type == ProblemTypes.MULTICLASS: - y = pd.Series([0, 2, 1, 2, 0, 2, 1]*2) + y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2) elif is_regression(problem_type): if lognormal_distribution: - y = pd.Series([1, 1, 1, 2, 3, 6, 9]*2) + y = pd.Series([1, 1, 1, 2, 3, 6, 9] * 2) else: - y = pd.Series([1, 2, 3, 3, 3, 4, 5]*2) + y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2) X = X_all[column_names] if input_type == "ww": From 644fa65d1ffdad5284fa7343b53581d972b57b75 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:55:30 +0100 Subject: [PATCH 23/36] lgbm, partial dep, permutation importance --- evalml/tests/component_tests/test_lgbm_regressor.py | 4 ++++ .../model_understanding_tests/test_partial_dependence.py | 1 + .../model_understanding_tests/test_permutation_importance.py | 1 + 3 files changed, 6 insertions(+) diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 1390cd1e0d..e6beda8bdb 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -150,6 +150,8 @@ def test_multiple_fit(mock_predict): X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]}) X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]}) X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category") + X1_fit.ww.init(logical_types={"feature": "categorical"}) + X1_predict.ww.init(logical_types={"feature": "categorical"}) clf = LightGBMRegressor() clf.fit(X1_fit, y) @@ -160,6 +162,8 @@ def test_multiple_fit(mock_predict): X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]}) X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]}) X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category") + X2_fit.ww.init(logical_types={"feature": "categorical"}) + X2_predict.ww.init(logical_types={"feature": "categorical"}) clf = LightGBMRegressor() clf.fit(X2_fit, y) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index f045dbc776..3e95e3261c 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1139,6 +1139,7 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git 
a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 35ebcd9266..4cf8bbd319 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -626,6 +626,7 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From a3c5766d684d14fa22885427a798b74dcb96be19 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:55:58 +0100 Subject: [PATCH 24/36] lint fixes --- .../model_understanding_tests/test_partial_dependence.py | 4 +++- .../model_understanding_tests/test_permutation_importance.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 3e95e3261c..a51638fae4 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1139,7 +1139,9 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 4cf8bbd319..09e673662d 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -626,7 +626,9 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From b9cdc83a97305b2c277ed317e2d63a243685e66b Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 17:06:14 +0100 Subject: [PATCH 25/36] delayed features --- .../test_delayed_features_transformer.py | 61 ++++++------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 925556c100..584f17ca0e 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -86,6 +86,8 @@ def test_delayed_feature_extractor_maxdelay3_gap1( answer["feature"] = X.feature.astype("int64") if not encode_y_as_str: answer["target_delay_0"] = y_answer.astype("int64") + else: + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=X, y=y) @@ -130,6 +132,8 @@ def test_delayed_feature_extractor_maxdelay5_gap1( "target_delay_5": y_answer.shift(5), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") assert_frame_equal( @@ 
-173,6 +177,8 @@ def test_delayed_feature_extractor_maxdelay3_gap7( "target_delay_3": y_answer.shift(3), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") assert_frame_equal( @@ -193,49 +199,6 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) -@pytest.mark.parametrize("encode_X_as_str", [True, False]) -@pytest.mark.parametrize("encode_y_as_str", [True, False]) -def test_delayed_feature_extractor_numpy( - encode_X_as_str, encode_y_as_str, delayed_features_data -): - X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings( - X, y, encode_X_as_str, encode_y_as_str - ) - X_np = X.values - y_np = y.values - answer = pd.DataFrame( - { - 0: X.feature, - "0_delay_1": X_answer.feature.shift(1), - "0_delay_2": X_answer.feature.shift(2), - "0_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - } - ) - if not encode_X_as_str: - answer[0] = X.feature.astype("int64") - assert_frame_equal( - answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np) - ) - - answer_only_y = pd.DataFrame( - { - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - } - ) - assert_frame_equal( - answer_only_y, - DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np), - ) - - @pytest.mark.parametrize( "delay_features,delay_target", [(False, True), (True, False), (False, False)] ) @@ -264,6 +227,8 @@ def test_lagged_feature_extractor_delay_features_delay_target( "target_delay_3": y_answer.shift(3), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: all_delays["feature"] = X.feature.astype("int64") if not delay_features: @@ -307,7 +272,8 @@ def test_lagged_feature_extractor_delay_target( "target_delay_3": y_answer.shift(3), } ) - + if encode_y_as_str: + y = y.astype("category") transformer = DelayedFeatureTransformer( max_delay=3, gap=1, delay_features=delay_features, delay_target=delay_target ) @@ -372,6 +338,8 @@ def test_delay_feature_transformer_supports_custom_index( X = make_data_type(data_type, X) y = make_data_type(data_type, y) + if encode_y_as_str: + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y) @@ -407,6 +375,7 @@ def test_delay_feature_transformer_multiple_categorical_columns(delayed_features "target_delay_1": y_answer.shift(1), } ) + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y) ) @@ -469,9 +438,13 @@ def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_compone dft.fit(X, y) transformed = dft.transform(X, y) assert isinstance(transformed, pd.DataFrame) + + if logical_type == Boolean: + transformed.ww.init(logical_types={"0_delay_1": "categorical"}) transformed_logical_types = { k: type(v) for k, v in transformed.ww.logical_types.items() } + if logical_type in [Integer, Double, Categorical]: assert transformed_logical_types == { 0: logical_type, From eb0cca3705262723560a1cb65d0dce6b6cdd7cc1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 18:49:08 +0100 Subject: [PATCH 26/36] email featurizer fix --- .../preprocessing/transform_primitive_components.py | 9 +-------- evalml/tests/conftest.py | 1 - 
.../model_understanding_tests/test_partial_dependence.py | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py index 17a2356bc3..ac93c18806 100644 --- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py +++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py @@ -72,20 +72,13 @@ def transform(self, X, y=None): es = self._make_entity_set(X_ww) features = ft.calculate_feature_matrix(features=self._features, entityset=es) - features.set_index(X_ww.index, inplace=True) X_ww = X_ww.ww.drop(self._columns) + features.ww.init(logical_types={col_: "categorical" for col_ in features}) for col in features: X_ww.ww[col] = features[col] - all_created_columns = self._get_feature_provenance().values() - to_categorical = { - col: "Categorical" - for feature_list in all_created_columns - for col in feature_list - } - X_ww.ww.set_types(to_categorical) return X_ww @staticmethod diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 2572dc97fb..c1a96c1fd4 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -43,7 +43,6 @@ handle_problem_types, is_regression, ) -from evalml.utils import infer_feature_types def pytest_configure(config): diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index a51638fae4..68d1574bc3 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -21,7 +21,6 @@ RegressionPipeline, ) from evalml.problem_types import ProblemTypes -from evalml.utils import infer_feature_types @pytest.fixture From 3fb872e26ad929ab9bfe6b7087df03b53b19baf2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 19:23:29 +0100 Subject: [PATCH 27/36] per column imputer --- .../tests/component_tests/test_per_column_imputer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py index f97390f5d3..6d163a53f8 100644 --- a/evalml/tests/component_tests/test_per_column_imputer.py +++ b/evalml/tests/component_tests/test_per_column_imputer.py @@ -128,7 +128,7 @@ def test_non_numeric_errors(non_numeric_df): def test_non_numeric_valid(non_numeric_df): - X = non_numeric_df + X = non_numeric_df.copy() X.ww.init( logical_types={ "A": "categorical", @@ -153,10 +153,19 @@ def test_non_numeric_valid(non_numeric_df): X_t = transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) + X = non_numeric_df.copy() # constant with all strings strategies = {"D": {"impute_strategy": "constant", "fill_value": 100}} transformer = PerColumnImputer(impute_strategies=strategies) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) X_expected = pd.DataFrame( [ ["a", "a", "a", "a"], From 8130180d499a90ed6be8e461e2ecc2027ce1fe72 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 13:49:35 +0100 Subject: [PATCH 28/36] change fraud100 --- evalml/tests/conftest.py | 9 ++++++++- .../test_explainers.py | 20 ------------------- .../test_partial_dependence.py | 3 --- .../test_permutation_importance.py | 6 ------ 4 
files changed, 8 insertions(+), 30 deletions(-) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index fe1d0f4ef5..f51df67529 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -990,7 +990,14 @@ def fraud_local(): @pytest.fixture def fraud_100(): X, y = load_fraud_local(n_rows=100) - X.ww.set_types(logical_types={"provider": "Categorical", "region": "Categorical"}) + X.ww.set_types( + logical_types={ + "provider": "Categorical", + "region": "Categorical", + "currency": "categorical", + "expiration_date": "categorical", + } + ) return X, y diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 9098b795dd..8d651427e1 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1263,9 +1263,6 @@ def transform_y_for_problem_type(problem_type, y): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = pipeline_class( component_graph=[ @@ -1310,8 +1307,6 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): X.ww.set_types( logical_types={ "provider": "NaturalLanguage", - "currency": "categorical", - "expiration_date": "categorical", } ) component_graph = [ @@ -1370,9 +1365,6 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = pipeline_class( component_graph=[ @@ -1427,9 +1419,6 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = { "SelectNumeric": ["Select Columns Transformer", "X", "y"], @@ -1489,9 +1478,6 @@ def test_categories_aggregated_but_not_those_that_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = [ "Select Columns Transformer", @@ -1535,9 +1521,6 @@ def test_categories_aggregated_when_some_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = [ "Select Columns Transformer", @@ -1643,9 +1626,6 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py 
index 68d1574bc3..1942a8e64a 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1138,9 +1138,6 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index b210ec22f2..f4e31d561b 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -310,9 +310,6 @@ def test_fast_permutation_importance_matches_slow_output( "dependency not installed." ) X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) if pipeline_class == LinearPipelineWithTextFeatures: X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) @@ -626,9 +623,6 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From 3b68cab0d72175d5920d932cade2cf3df2cd13de Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 15:18:36 +0100 Subject: [PATCH 29/36] permutation importance --- evalml/model_understanding/permutation_importance.py | 1 + evalml/pipelines/components/transformers/column_selectors.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index a2420e63f9..e49053f212 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -293,6 +293,7 @@ def _shuffle_and_score_helper( col = X_permuted.iloc[shuffling_idx, col_idx] col.index = X_permuted.index X_permuted.iloc[:, col_idx] = col + X_permuted.ww.init(schema=X_features.ww.schema) if is_fast: feature_score = scorer(pipeline, X_permuted, X_features, y, objective) else: diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index 67a2bb66f5..52b46121a8 100644 --- a/evalml/pipelines/components/transformers/column_selectors.py +++ b/evalml/pipelines/components/transformers/column_selectors.py @@ -31,7 +31,7 @@ def _check_input_for_columns(self, X): missing_cols = set(cols) - set(column_names) if missing_cols: - raise ValueError("Columns of type {column_types} not found in input data.") + raise ValueError(f"Columns of type {missing_cols} not found in input data.") @abstractmethod def _modify_columns(self, cols, X, y=None): From 9128d9c8ae0f6fa88b117f46c31ee02f59cd4b88 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 15:34:40 +0100 Subject: [PATCH 30/36] model_understanding docs update --- docs/source/user_guide/model_understanding.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/model_understanding.ipynb b/docs/source/user_guide/model_understanding.ipynb index 570351bf67..ad5b910c8e 100644 --- a/docs/source/user_guide/model_understanding.ipynb +++ b/docs/source/user_guide/model_understanding.ipynb @@ 
-154,7 +154,8 @@ "outputs": [], "source": [ "X_fraud, y_fraud = evalml.demos.load_fraud(100, verbose=False)\n", - "X_fraud.ww.init(logical_types={\"provider\": \"Categorical\", 'region': \"Categorical\"})\n", + "X_fraud.ww.init(logical_types={\"provider\": \"Categorical\", 'region': \"Categorical\",\n", + " \"currency\": \"Categorical\", \"expiration_date\": \"Categorical\"})\n", "\n", "fraud_pipeline = BinaryClassificationPipeline([\"DateTime Featurization Component\",\"One Hot Encoder\", \"Random Forest Classifier\"])\n", "fraud_pipeline.fit(X_fraud, y_fraud)\n", From 6fcf20524255c91e04388960c1d40d15cf2ee673 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 16:16:00 +0100 Subject: [PATCH 31/36] data check update --- .../test_target_leakage_data_check.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py index 828a7e186d..9ffb4c3e83 100644 --- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py +++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py @@ -378,6 +378,7 @@ def test_target_leakage_regression(): X["c"] = y / 10 X["d"] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] X["e"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"] + X.ww.init(logical_types={"e": "categorical"}) expected = { "warnings": [ @@ -399,6 +400,12 @@ def test_target_leakage_regression(): message_code=DataCheckMessageCode.TARGET_LEAKAGE, details={"column": "c"}, ).to_dict(), + DataCheckWarning( + message="Column 'e' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "e"}, + ).to_dict(), ], "errors": [], "actions": [ @@ -411,6 +418,9 @@ def test_target_leakage_regression(): DataCheckAction( DataCheckActionCode.DROP_COL, metadata={"column": "c"} ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "e"} + ).to_dict(), ], } From caefd12e6110c23ad976ed98bb507d7ef26b392c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 16:32:36 +0100 Subject: [PATCH 32/36] update objectives --- docs/source/user_guide/objectives.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index c78ed15f02..b79dc9f6ce 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -69,7 +69,8 @@ "from evalml.objectives import F1\n", "\n", "X, y = load_fraud(n_rows=100)\n", - "X.ww.init(logical_types={\"provider\": \"Categorical\", \"region\": \"Categorical\"})\n", + "X.ww.init(logical_types={\"provider\": \"Categorical\", \"region\": \"Categorical\",\n", + " \"currency\": \"Categorical\", \"expiration_date\": \"Categorical\"})\n", "objective = F1()\n", "pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'DateTime Featurization Component', 'One Hot Encoder', 'Random Forest Classifier'])\n", "pipeline.fit(X, y)\n", From a562c67ce9acdd001b235ab8de7e70de5dfc40d6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 20:59:56 +0100 Subject: [PATCH 33/36] test updates --- .../test_delayed_features_transformer.py | 40 +++++++++++++++++++ .../component_tests/test_lgbm_classifier.py | 8 ++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git 
a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 584f17ca0e..2321a98693 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -199,6 +199,46 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) +def test_delayed_feature_extractor_numpy( + delayed_features_data +): + X, y = delayed_features_data + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, False, False + ) + X_np = X.values + y_np = y.values + answer = pd.DataFrame( + { + 0: X.feature, + "0_delay_1": X_answer.feature.shift(1), + "0_delay_2": X_answer.feature.shift(2), + "0_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np), + ) + + @pytest.mark.parametrize( "delay_features,delay_target", [(False, True), (True, False), (False, False)] ) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 2a35d0ba7c..5784029a94 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -174,14 +174,14 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): X = pd.DataFrame( { - "feature_1": [0, 0, 1, 1, 0, 1] * 2, - "feature_2": ["a", "a", "b", "b", "c", "c"] * 2, + "feature_1": [0, 0, 1, 1, 0, 1], + "feature_2": ["a", "a", "b", "b", "c", "c"], } ) X.ww.init(logical_types={"feature_2": "categorical"}) - y = pd.Series([1, 1, 0, 0, 0, 1] * 2) + y = pd.Series([1, 1, 0, 0, 0, 1]) X_expected = pd.DataFrame( - {0: [0, 0, 1, 1, 0, 1] * 2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0] * 2} + {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]} ) X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") From 482a0d6fcfb3d9a884562b9bb9b34c6b0ab746bc Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 21:25:16 +0100 Subject: [PATCH 34/36] more updates --- .../component_tests/test_delayed_features_transformer.py | 8 ++------ .../prediction_explanations_tests/test_force_plots.py | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 2321a98693..b4245a3a8a 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -199,13 +199,9 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) -def test_delayed_feature_extractor_numpy( - delayed_features_data -): +def test_delayed_feature_extractor_numpy(delayed_features_data): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings( - X, y, False, False - ) + X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, False, 
False) X_np = X.values y_np = y.values answer = pd.DataFrame( diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py index b08f8ccc26..48b864a616 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py @@ -217,7 +217,6 @@ def test_force_plot_regression( def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 columns_to_select = ["datetime", "amount", "provider", "currency"] - X.ww.init(logical_types={"currency": "categorical"}) pipeline = pipeline_class( component_graph=[ From 3a353ea1bc664cd212deddeea1d9376237767dfe Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 22:04:34 +0100 Subject: [PATCH 35/36] featuretools upgrade --- evalml/tests/component_tests/test_imputer.py | 1 - .../component_tests/test_one_hot_encoder.py | 76 +++++-------------- .../latest_dependency_versions.txt | 2 +- 3 files changed, 21 insertions(+), 58 deletions(-) diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 608f62c02b..7024e05807 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -144,7 +144,6 @@ def test_categorical_only_input(imputer_test_data): ) imputer = Imputer() - imputer.fit(X, y) transformed = imputer.fit_transform(X, y) assert_frame_equal(transformed, expected, check_dtype=False) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 36a5cad415..9f5ba9c1e5 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -17,6 +17,17 @@ from evalml.utils import get_random_seed, infer_feature_types +def set_first_three_columns_to_categorical(X): + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) + return X + + def test_init(): parameters = { "top_n": 10, @@ -166,13 +177,7 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -181,6 +186,7 @@ def test_drop_first(): assert col_names == expected_col_names + def test_drop_binary(): X = pd.DataFrame( { @@ -189,13 +195,7 @@ def test_drop_binary(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -212,13 +212,7 @@ def test_drop_parameter_is_array(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -237,13 +231,7 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", 
"a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=2, drop="if_binary") encoder.fit(X) X_t = encoder.transform(X) @@ -261,13 +249,7 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(handle_unknown="error") encoder.fit(X) assert isinstance(encoder.transform(X), pd.DataFrame) @@ -334,13 +316,7 @@ def test_categories(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] @@ -415,13 +391,7 @@ def test_more_top_n_unique_values(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) random_seed = 2 @@ -464,13 +434,7 @@ def test_more_top_n_unique_values_large(): "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) random_seed = 2 encoder = OneHotEncoder(top_n=3, random_seed=random_seed) diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 47683d73c1..14a285469e 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -3,7 +3,7 @@ click==8.0.1 cloudpickle==1.6.0 colorama==0.4.4 dask==2021.8.1 -featuretools==0.26.2 +featuretools==0.27.0 graphviz==0.17 imbalanced-learn==0.8.0 ipywidgets==7.6.3 From 61c91eee8b2c925ee663512dbf8477d093b4b36f Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 22:08:50 +0100 Subject: [PATCH 36/36] lint fix --- evalml/tests/component_tests/test_one_hot_encoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 9f5ba9c1e5..f382bcd257 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -186,7 +186,6 @@ def test_drop_first(): assert col_names == expected_col_names - def test_drop_binary(): X = pd.DataFrame( {