Skip to content

Fix mrmr working with categoricals #1311

Merged
merged 4 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Fix `mrmr` feature selection to work with categorical features ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
-

### Removed
Expand Down
6 changes: 6 additions & 0 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ def mrmr(
relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]

# can't compute correlation of categorical column with the others
try:
regressors = regressors.astype(float)
except ValueError as e:
raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}")

relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

all_features = relevance.index.to_list()
Expand Down
35 changes: 35 additions & 0 deletions tests/test_analysis/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,43 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import duplicate_data


@pytest.fixture(autouse=True)
def close_plots():
    """Close the current matplotlib figure once a test has finished.

    Declared with ``autouse=True``, so tests do not have to request it
    explicitly. No setup is performed; the only work happens on teardown.
    """
    # Hand control to the test; everything after ``yield`` is teardown.
    yield
    # Called with no arguments, so only the current figure is closed.
    plt.close()


@pytest.fixture
def exog_and_target_dfs():
    """Build a two-segment target frame and a matching exogenous frame.

    The target covers segments ``"a"`` and ``"b"`` with 30 daily timestamps
    each (starting 2020-01-01) and is passed through ``TSDataset.to_dataset``.

    The exogenous frame is built once and copied to both segments with
    ``duplicate_data``. Besides three numeric columns it contains:

    * ``cast`` -- categorical column whose labels are numeric strings, with one missing value;
    * ``no_cast`` -- categorical column containing the non-numeric label ``"two"``;
    * ``none`` -- numeric column with one missing value.

    Returns
    -------
    :
        Pair ``(ts, df_exog)``.
    """
    n_points = 30
    segment_names = ["a", "b"]
    dates = list(pd.date_range("2020-01-01", periods=n_points, freq="D"))

    # Target in long format: all rows of segment "a" first, then segment "b".
    target_long = pd.DataFrame(
        {
            "segment": [name for name in segment_names for _ in range(n_points)],
            "timestamp": dates * len(segment_names),
            "target": np.arange(n_points * len(segment_names)),
        }
    )
    ts = TSDataset.to_dataset(target_long)

    # Position 19 of ``cast`` and position 10 of ``none`` hold the missing values.
    castable_values = ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10
    uncastable_values = ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10
    values_with_gap = [1] * 10 + [None] + [2] * 9 + [56.1] * 10

    exog_single = pd.DataFrame(
        {
            "timestamp": dates,
            "exog1": np.arange(100, 70, -1),
            "exog2": np.sin(np.arange(30) / 10),
            "exog3": np.exp(np.arange(30)),
            "cast": castable_values,
            "no_cast": uncastable_values,
            "none": values_with_gap,
        }
    )
    for column in ("cast", "no_cast"):
        exog_single[column] = exog_single[column].astype("category")

    df_exog = duplicate_data(exog_single, segments=segment_names)
    return ts, df_exog
30 changes: 0 additions & 30 deletions tests/test_analysis/test_feature_relevance/test_relevance_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance):
assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"]


@pytest.fixture()
def exog_and_target_dfs():
    """Return a target frame and an exogenous frame for segments ``"a"`` and ``"b"``.

    Each segment has 30 timestamps. The exogenous frame holds three numeric
    columns, a categorical column of numeric-looking strings with one missing
    value (``cast``), a categorical column with the non-numeric label ``"two"``
    (``no_cast``) and a numeric column with one missing value (``none``).

    Returns
    -------
    :
        Pair ``(ts, df_exog)``: the output of ``TSDataset.to_dataset`` for the
        target and the output of ``duplicate_data`` for the exogenous data.
    """
    segment_names = ["a", "b"]
    first_30_days = list(pd.date_range("2020-01-01", "2021-01-01")[:30])

    # Long-format target: 30 rows for "a" followed by 30 rows for "b".
    segment_column = []
    for name in segment_names:
        segment_column.extend([name] * 30)
    target_frame = pd.DataFrame(
        {
            "segment": segment_column,
            "timestamp": first_30_days + first_30_days,
            "target": np.arange(60),
        }
    )
    ts = TSDataset.to_dataset(target_frame)

    exog_columns = {
        "timestamp": first_30_days,
        "exog1": np.arange(100, 70, -1),
        "exog2": np.sin(np.arange(30) / 10),
        "exog3": np.exp(np.arange(30)),
        # numeric strings, missing value at position 19
        "cast": ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10,
        # "two" has no numeric interpretation
        "no_cast": ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10,
        # plain numbers, missing value at position 10
        "none": [1] * 10 + [None] + [2] * 9 + [56.1] * 10,
    }
    exog_frame = pd.DataFrame(exog_columns)
    exog_frame["cast"] = exog_frame["cast"].astype("category")
    exog_frame["no_cast"] = exog_frame["no_cast"].astype("category")

    df_exog = duplicate_data(exog_frame, segments=segment_names)
    return ts, df_exog


@pytest.fixture()
def exog_and_target_dfs_with_none():
seg = ["a"] * 30 + ["b"] * 30
Expand Down
21 changes: 21 additions & 0 deletions tests/test_analysis/test_feature_selection/test_mrmr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict
from unittest.mock import Mock

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -32,6 +33,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
df_exog[f"regressor_useless_{i}"] = regressor

# useless categorical regressors
num_cat_useless = 3
for i in range(num_cat_useless):
df_exog[f"categorical_regressor_useless_{i}"] = i
df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category")

# useful regressors: the same as target but with little noise
df_regressors_useful = df.copy()
sampler = RandomState(seed=2).normal
Expand Down Expand Up @@ -174,3 +181,17 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors):
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)


@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_with_castable_categorical_regressor(df_with_regressors, fast_redundancy):
    """Smoke test: ``mrmr`` runs on the ``df_with_regressors`` data in both redundancy modes.

    There are no assertions; the test passes when the call completes
    without raising.
    """
    target_df = df_with_regressors["df"]
    exog_df = df_with_regressors["regressors"]
    table = ModelRelevanceTable()(df=target_df, df_exog=exog_df, model=RandomForestRegressor())
    mrmr(
        relevance_table=table,
        regressors=exog_df,
        top_k=len(exog_df),
        fast_redundancy=fast_redundancy,
    )


@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy):
    """Check that ``mrmr`` raises ``ValueError`` on the ``exog_and_target_dfs`` regressors.

    The relevance table is a ``Mock`` because the expected error is the
    float-conversion failure on the regressors, not anything derived from
    the table's contents.
    """
    # Only the exogenous part of the fixture is needed here.
    _, exog_df = exog_and_target_dfs
    expected_message = "Only convertible to float features are allowed!"
    with pytest.raises(ValueError, match=expected_message):
        mrmr(
            relevance_table=Mock(),
            regressors=exog_df,
            top_k=len(exog_df),
            fast_redundancy=fast_redundancy,
        )