tinkoff-ai · Mr-Geekman · Jul 10, 2023 · Jul 10, 2023 · Jul 10, 2023 · Jul 10, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - 
 - 
 - 
-- 
+- `mrmr` feature selection now works with categorical features that are convertible to float ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
 - 
 
 ### Removed

diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py
@@ -72,6 +72,12 @@ def mrmr(
     relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
     redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]
 
+    # can't compute correlation of categorical column with the others
+    try:
+        regressors = regressors.astype(float)
+    except ValueError as e:
+        raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}")
+
     relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)
 
     all_features = relevance.index.to_list()

diff --git a/tests/test_analysis/conftest.py b/tests/test_analysis/conftest.py
@@ -1,8 +1,43 @@
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import pytest
 
+from etna.datasets import TSDataset
+from etna.datasets import duplicate_data
+
 
 @pytest.fixture(autouse=True)
 def close_plots():
     yield
     plt.close()
+
+
+@pytest.fixture
+def exog_and_target_dfs():  # builds a two-segment target dataset plus a frame of exogenous features
+    seg = ["a"] * 30 + ["b"] * 30  # 30 rows labelled "a" followed by 30 rows labelled "b"
+    time = list(pd.date_range("2020-01-01", "2021-01-01")[:30])  # first 30 timestamps of the range
+    timestamps = time * 2  # the same timestamps repeated once per segment
+    target = np.arange(60)  # values 0..59
+    df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target})
+    ts = TSDataset.to_dataset(df)  # presumably converts the long frame to etna's wide format; confirm
+
+    cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10  # numeric strings with one missing value
+    no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10  # "two" is not a numeric string
+    none = [1] * 10 + [2] * 10 + [56.1] * 10  # plain numbers ...
+    none[10] = None  # ... with a single missing value at position 10
+    df = pd.DataFrame(
+        {
+            "timestamp": time,
+            "exog1": np.arange(100, 70, -1),  # 100 down to 71
+            "exog2": np.sin(np.arange(30) / 10),
+            "exog3": np.exp(np.arange(30)),
+            "cast": cast,
+            "no_cast": no_cast,
+            "none": none,
+        }
+    )
+    df["cast"] = df["cast"].astype("category")  # stored with categorical dtype
+    df["no_cast"] = df["no_cast"].astype("category")  # stored with categorical dtype
+    df_exog = duplicate_data(df, segments=["a", "b"])  # presumably replicates the frame per segment; confirm
+    return ts, df_exog  # (target dataset, exogenous features)
diff --git a/tests/test_analysis/test_feature_relevance/test_relevance_table.py b/tests/test_analysis/test_feature_relevance/test_relevance_table.py
@@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance):
     assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"]
 
 
-@pytest.fixture()
-def exog_and_target_dfs():
-    seg = ["a"] * 30 + ["b"] * 30
-    time = list(pd.date_range("2020-01-01", "2021-01-01")[:30])
-    timestamps = time * 2
-    target = np.arange(60)
-    df = pd.DataFrame({"segment": seg, "timestamp": timestamps, "target": target})
-    ts = TSDataset.to_dataset(df)
-
-    cast = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10
-    no_cast = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10
-    none = [1] * 10 + [2] * 10 + [56.1] * 10
-    none[10] = None
-    df = pd.DataFrame(
-        {
-            "timestamp": time,
-            "exog1": np.arange(100, 70, -1),
-            "exog2": np.sin(np.arange(30) / 10),
-            "exog3": np.exp(np.arange(30)),
-            "cast": cast,
-            "no_cast": no_cast,
-            "none": none,
-        }
-    )
-    df["cast"] = df["cast"].astype("category")
-    df["no_cast"] = df["no_cast"].astype("category")
-    df_exog = duplicate_data(df, segments=["a", "b"])
-    return ts, df_exog
-
-
 @pytest.fixture()
 def exog_and_target_dfs_with_none():
     seg = ["a"] * 30 + ["b"] * 30

diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py
@@ -1,4 +1,5 @@
 from typing import Dict
+from unittest.mock import Mock
 
 import numpy as np
 import pandas as pd
@@ -32,6 +33,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
         regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
         df_exog[f"regressor_useless_{i}"] = regressor
 
+    # useless categorical regressor
+    num_cat_useless = 3
+    for i in range(num_cat_useless):
+        df_exog[f"categorical_regressor_useless_{i}"] = i
+        df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category")
+
     # useful regressors: the same as target but with little noise
     df_regressors_useful = df.copy()
     sampler = RandomState(seed=2).normal
@@ -174,3 +181,17 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors):
     relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
     with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
         mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)
+
+
+@pytest.mark.parametrize("fast_redundancy", [True, False])
+def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy):  # no assertions: passes if mrmr does not raise
+    df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]  # NOTE(review): regressors is a DataFrame, so len(regressors) below is its row count, not the number of features; confirm top_k intent
+    relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
+    mrmr(relevance_table=relevance_table, regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)
+
+
+@pytest.mark.parametrize("fast_redundancy", [True, False])
+def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy):  # expects mrmr to reject the fixture's regressors
+    df, regressors = exog_and_target_dfs  # the fixture's exogenous frame includes the "no_cast" column containing "two"
+    with pytest.raises(ValueError, match="Only convertible to float features are allowed!"):
+        mrmr(relevance_table=Mock(), regressors=regressors, top_k=len(regressors), fast_redundancy=fast_redundancy)  # Mock table: presumably the error is raised before the table is used; confirm
-Original file line number
+Diff line change
@@ Expand Up @@
     -
     -
     -
-    -
+    - `mrmr` feature selection now works with categorical features that are convertible to float ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
     -
     ### Removed
@@ Expand Down @@