Skip to content

Fix mrmr working with categoricals #1311

Merged
merged 4 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
-
-
- Fix `mrmr` feature selection to work with categorical features that can be cast to float ([#1311](https://github.com/tinkoff-ai/etna/pull/1311))
-

### Removed
Expand Down
6 changes: 6 additions & 0 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ def mrmr(
relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]

# can't compute correlation of categorical column with the others
try:
regressors = regressors.astype(float)
except ValueError as e:
raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}")

relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

all_features = relevance.index.to_list()
Expand Down
35 changes: 35 additions & 0 deletions tests/test_analysis/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,43 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import duplicate_data


@pytest.fixture(autouse=True)
def close_plots():
    """Close matplotlib figures after each test that uses this fixture.

    The fixture is ``autouse``, so it wraps every test that can see it.
    The test body runs at the ``yield``; the code after it is teardown.
    """
    yield
    # A bare ``plt.close()`` closes only the *current* figure, so a test that
    # creates several figures would leak the rest into subsequent tests.
    plt.close("all")


@pytest.fixture
def exog_and_target_dfs():
    """Build a two-segment target dataset and a matching exogenous frame.

    Returns
    -------
    :
        Tuple ``(ts, df_exog)``. ``ts`` is the output of ``TSDataset.to_dataset``
        for segments "a" and "b" with 30 timestamps each and target ``0..59``.
        ``df_exog`` is a 30-row feature frame passed through ``duplicate_data``
        for the same two segments. It holds three numeric columns, a categorical
        column of numeric strings with one missing value (``cast``), a categorical
        column containing the non-numeric string "two" (``no_cast``) and a numeric
        column with one missing value (``none``).
    """
    n_points = 30
    segment_names = ["a", "b"]
    stamps = list(pd.date_range("2020-01-01", "2021-01-01")[:n_points])

    # Target in long format: every segment shares the same timestamps.
    long_target = pd.DataFrame(
        {
            "segment": [name for name in segment_names for _ in range(n_points)],
            "timestamp": stamps * len(segment_names),
            "target": np.arange(n_points * len(segment_names)),
        }
    )
    ts = TSDataset.to_dataset(long_target)

    steps = np.arange(n_points)
    features = pd.DataFrame(
        {
            "timestamp": stamps,
            "exog1": np.arange(100, 70, -1),
            "exog2": np.sin(steps / 10),
            "exog3": np.exp(steps),
            # numeric strings plus one missing value
            "cast": ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10,
            # "two" is not a numeric string
            "no_cast": ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10,
            # plain numbers with a missing value at position 10
            "none": [1] * 10 + [None] + [2] * 9 + [56.1] * 10,
        }
    )
    for column in ("cast", "no_cast"):
        features[column] = features[column].astype("category")

    df_exog = duplicate_data(features, segments=segment_names)
    return ts, df_exog
30 changes: 0 additions & 30 deletions tests/test_analysis/test_feature_relevance/test_relevance_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,36 +38,6 @@ def test_model_relevance_table(simple_df_relevance):
assert relevance_table["regressor_1"]["2"] < relevance_table["regressor_2"]["2"]


@pytest.fixture()
def exog_and_target_dfs():
    """Build a two-segment target dataset and a matching exogenous frame.

    Returns
    -------
    :
        Tuple ``(ts, df_exog)``. ``ts`` is the output of ``TSDataset.to_dataset``
        for segments "a" and "b" with 30 timestamps each and target ``0..59``.
        ``df_exog`` is a 30-row feature frame passed through ``duplicate_data``
        for the same two segments. It holds three numeric columns, two categorical
        columns (``cast``, ``no_cast``) and a numeric column with one missing
        value (``none``).
    """
    n_points = 30
    segment_names = ["a", "b"]
    stamps = list(pd.date_range("2020-01-01", "2021-01-01")[:n_points])

    # Target in long format: every segment shares the same timestamps.
    long_target = pd.DataFrame(
        {
            "segment": [name for name in segment_names for _ in range(n_points)],
            "timestamp": stamps * len(segment_names),
            "target": np.arange(n_points * len(segment_names)),
        }
    )
    ts = TSDataset.to_dataset(long_target)

    steps = np.arange(n_points)
    features = pd.DataFrame(
        {
            "timestamp": stamps,
            "exog1": np.arange(100, 70, -1),
            "exog2": np.sin(steps / 10),
            "exog3": np.exp(steps),
            # numeric strings plus one missing value
            "cast": ["1.1"] * 10 + ["2"] * 9 + [None] + ["56.1"] * 10,
            # "two" is not a numeric string
            "no_cast": ["1.1"] * 10 + ["two"] * 10 + ["56.1"] * 10,
            # plain numbers with a missing value at position 10
            "none": [1] * 10 + [None] + [2] * 9 + [56.1] * 10,
        }
    )
    for column in ("cast", "no_cast"):
        features[column] = features[column].astype("category")

    df_exog = duplicate_data(features, segments=segment_names)
    return ts, df_exog


@pytest.fixture()
def exog_and_target_dfs_with_none():
seg = ["a"] * 30 + ["b"] * 30
Expand Down
21 changes: 21 additions & 0 deletions tests/test_analysis/test_feature_selection/test_mrmr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict
from unittest.mock import Mock

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -32,6 +33,12 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
regressor = df_regressors_useless[df_regressors_useless["segment"] == segment]["target"].values
df_exog[f"regressor_useless_{i}"] = regressor

# useless categorical regressor
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regressor -> regressors?

num_cat_useless = 3
for i in range(num_cat_useless):
df_exog[f"categorical_regressor_useless_{i}"] = i
df_exog[f"categorical_regressor_useless_{i}"] = df_exog[f"categorical_regressor_useless_{i}"].astype("category")

# useful regressors: the same as target but with little noise
df_regressors_useful = df.copy()
sampler = RandomState(seed=2).normal
Expand Down Expand Up @@ -174,3 +181,17 @@ def test_fast_redundancy_deprecation_warning(df_with_regressors):
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)


@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_with_categorical_regressor(df_with_regressors, fast_redundancy):
    """Smoke test: ``mrmr`` runs on regressors that include categorical columns.

    There is no assertion on the result; the test passes if the call does not raise.
    """
    # NOTE(review): a reviewer suggested renaming this test to
    # ``test_mrmr_with_castable_categorical_regressor``.
    target_df = df_with_regressors["df"]
    exog_df = df_with_regressors["regressors"]
    table = ModelRelevanceTable()(df=target_df, df_exog=exog_df, model=RandomForestRegressor())
    mrmr(
        relevance_table=table,
        regressors=exog_df,
        top_k=len(exog_df),
        fast_redundancy=fast_redundancy,
    )


@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_with_uncastable_categorical_regressor_fails(exog_and_target_dfs, fast_redundancy):
    """``mrmr`` must raise ``ValueError`` when a regressor cannot be cast to float."""
    # Only the exogenous frame is needed; the target dataset is ignored.
    _, exog_df = exog_and_target_dfs
    expected_message = "Only convertible to float features are allowed!"
    with pytest.raises(ValueError, match=expected_message):
        mrmr(
            relevance_table=Mock(),
            regressors=exog_df,
            top_k=len(exog_df),
            fast_redundancy=fast_redundancy,
        )