From 012f26c6dfd38b3a5da4974b3fa3dd2457d9148f Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 10 Jul 2023 14:40:13 +0300 Subject: [PATCH 1/4] Fix redundancy computation in mrmr --- etna/analysis/feature_selection/mrmr_selection.py | 8 ++++++++ .../test_feature_selection/test_mrmr.py | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index ba1d84eb3..2e4e7dc5a 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -82,6 +82,14 @@ def mrmr( redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features) top_k = min(top_k, len(all_features)) + # can't compute correlation of categorical column with the others + cat_cols = regressors.dtypes[regressors.dtypes == "category"].index + for cat_col in cat_cols: + try: + regressors[cat_col] = regressors[cat_col].astype(float) + except ValueError: + raise ValueError(f"{cat_col} column cannot be cast to float type! Please, use encoders.") + for i in range(top_k): score_numerator = relevance.loc[not_selected_features] score_denominator = pd.Series(1, index=not_selected_features) diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index a45190bec..b3f626ff3 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -32,6 +32,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]: regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values df_exog[f"regressor_useless_{i}"] = regressor + # useless categorical regressor + num_cat_useless = 3 + for i in range(num_cat_useless): + df_exog[f"categorical_regressor_useless_{i}"] = i + df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category") + # useful regressors: the same as target but with little noise df_regressors_useful = df.copy() sampler = RandomState(seed=2).normal @@ -174,3 +180,10 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors): relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"): mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False) + + +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy): + df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] + relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) + mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy) From 4e3f2326953520831e3664eb3e95d6aa22eb8324 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 10 Jul 2023 14:44:43 +0300 Subject: [PATCH 2/4] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b53ebaae1..cbd644ea9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - - -- +- `mrmr` feature selection working with categoricals ([#1311](https://github.com/tinkoff-ai/etna/pull/1311)) - ### Removed From 2835efa0037172abce02bb5ecc30ab560af4e172 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 10 Jul 2023 15:59:58 +0300 Subject: [PATCH 3/4] Review fixes --- .../feature_selection/mrmr_selection.py | 14 ++++---- tests/test_analysis/conftest.py | 35 +++++++++++++++++++ .../test_relevance_table.py | 30 ---------------- .../test_feature_selection/test_mrmr.py | 8 +++++ 4 files changed, 49 insertions(+), 38 deletions(-) diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index 2e4e7dc5a..a926b172f 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -72,6 +72,12 @@ def mrmr( relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)] redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)] + # can't compute correlation of categorical column with the others + try: + regressors = regressors.astype(float) + except ValueError as e: + raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}") + relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0) all_features = relevance.index.to_list() @@ -82,14 +88,6 @@ def mrmr( redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features) top_k = min(top_k, len(all_features)) - # can't compute correlation of categorical column with the others - cat_cols = regressors.dtypes[regressors.dtypes == "category"].index - for cat_col in cat_cols: - try: - regressors[cat_col] = regressors[cat_col].astype(float) - except ValueError: - raise ValueError(f"{cat_col} column cannot be cast to float type! Please, use encoders.") - for i in range(top_k): score_numerator = relevance.loc[not_selected_features] score_denominator = pd.Series(1, index=not_selected_features) diff --git a/tests/test_analysis/conftest.py b/tests/test_analysis/conftest.py index 280cf50e8..3873300a5 100644 --- a/tests/test_analysis/conftest.py +++ b/tests/test_analysis/conftest.py @@ -1,8 +1,43 @@ import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import pytest +from etna.datasets import TSDataset +from etna.datasets import duplicate_data + @pytest.fixture(autouse=True) def close_plots(): yield plt.close() + + +@pytest.fixture +def exog_and_target_dfs(): + seg = ["a"] * 30 + ["b"] * 30 + time = list(pd.date_range("2020-01-01", "2021-01-01")[:30]) + timestamps = time * 2 + target = np.arange(60) + df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target}) + ts = TSDataset.to_dataset(df) + + cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10 + no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10 + none = [1] * 10 + [2] * 10 + [56.1] * 10 + none[10] = None + df = pd.DataFrame( + { + "timestamp": time, + "exog1": np.arange(100, 70, -1), + "exog2": np.sin(np.arange(30) / 10), + "exog3": np.exp(np.arange(30)), + "cast": cast, + "no_cast": no_cast, + "none": none, + } + ) + df["cast"] = df["cast"].astype("category") + df["no_cast"] = df["no_cast"].astype("category") + df_exog = duplicate_data(df, segments=["a", "b"]) + return ts, df_exog diff --git a/tests/test_analysis/test_feature_relevance/test_relevance_table.py b/tests/test_analysis/test_feature_relevance/test_relevance_table.py index d68bd40ea..6b6e9f8fb 100644 --- a/tests/test_analysis/test_feature_relevance/test_relevance_table.py +++ b/tests/test_analysis/test_feature_relevance/test_relevance_table.py @@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance): assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"] -@pytest.fixture() -def exog_and_target_dfs(): - seg = ["a"] * 30 + ["b"] * 30 - time = list(pd.date_range("2020-01-01", "2021-01-01")[:30]) - timestamps = time * 2 - target = np.arange(60) - df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target}) - ts = TSDataset.to_dataset(df) - - cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10 - no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10 - none = [1] * 10 + [2] * 10 + [56.1] * 10 - none[10] = None - df = pd.DataFrame( - { - "timestamp": time, - "exog1": np.arange(100, 70, -1), - "exog2": np.sin(np.arange(30) / 10), - "exog3": np.exp(np.arange(30)), - "cast": cast, - "no_cast": no_cast, - "none": none, - } - ) - df["cast"] = df["cast"].astype("category") - df["no_cast"] = df["no_cast"].astype("category") - df_exog = duplicate_data(df, segments=["a", "b"]) - return ts, df_exog - - @pytest.fixture() def exog_and_target_dfs_with_none(): seg = ["a"] * 30 + ["b"] * 30 diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index b3f626ff3..44c52508c 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -1,4 +1,5 @@ from typing import Dict +from unittest.mock import Mock import numpy as np import pandas as pd @@ -187,3 +188,10 @@ def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy): df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy) + + +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy): + df, regressors = exog_and_target_dfs + with pytest.raises(ValueError, match="Only convertible to float features are allowed!"): + mrmr(relevance_table=Mock(), regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy) From 2911e30a40d96a27c87047b469987f379a9c857b Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 10 Jul 2023 16:57:36 +0300 Subject: [PATCH 4/4] Fix naming --- tests/test_analysis/test_feature_selection/test_mrmr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index 44c52508c..c2caf7c71 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -33,7 +33,7 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]: regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values df_exog[f"regressor_useless_{i}"] = regressor - # useless categorical regressor + # useless categorical regressors num_cat_useless = 3 for i in range(num_cat_useless): df_exog[f"categorical_regressor_useless_{i}"] = i @@ -184,7 +184,7 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors): @pytest.mark.parametrize("fast_redundancy", [True, False]) -def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy): +def test_mrmr_with_castable_categorical_regressor(df_with_regressors, fast_redundancy): df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)