From 66c9263e15dcb51ff6ee773ff67ebc2c2ad0913d Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Thu, 27 Apr 2023 12:38:34 +0300 Subject: [PATCH 1/7] feature: add default params_to_tune for feature selection transforms --- .../feature_selection/feature_importance.py | 67 +++++++++++++++++-- .../feature_selection/gale_shapley.py | 23 +++++++ .../test_feature_importance_transform.py | 32 +++++++++ .../test_gale_shapley_transform.py | 16 +++++ 4 files changed, 132 insertions(+), 6 deletions(-) diff --git a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py index a1470a613..c59aaacfb 100644 --- a/etna/transforms/feature_selection/feature_importance.py +++ b/etna/transforms/feature_selection/feature_importance.py @@ -14,12 +14,19 @@ from sklearn.tree import ExtraTreeRegressor from typing_extensions import Literal +from etna import SETTINGS from etna.analysis import RelevanceTable from etna.analysis.feature_selection.mrmr_selection import AggregationMode from etna.analysis.feature_selection.mrmr_selection import mrmr from etna.datasets import TSDataset from etna.transforms.feature_selection import BaseFeatureSelectionTransform +if SETTINGS.auto_required: + from optuna.distributions import BaseDistribution + from optuna.distributions import CategoricalDistribution + from optuna.distributions import IntUniformDistribution + + TreeBasedRegressor = Union[ DecisionTreeRegressor, ExtraTreeRegressor, @@ -41,7 +48,7 @@ class TreeFeatureSelectionTransform(BaseFeatureSelectionTransform): def __init__( self, - model: TreeBasedRegressor, + model: Union[Literal["catboost"], Literal["random_forest"], TreeBasedRegressor], top_k: int, features_to_use: Union[List[str], Literal["all"]] = "all", return_features: bool = False, @@ -52,8 +59,14 @@ def __init__( Parameters ---------- model: - model to make selection, it should have ``feature_importances_`` property - (e.g. all tree-based regressors in sklearn) + Model to make selection, it should have ``feature_importances_`` property + (e.g. all tree-based regressors in sklearn). 
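For illustration, the two construction styles below should be interchangeable once this change lands (a minimal sketch, assuming only the public import used by the tests in this patch; the top_k value is arbitrary):

    from catboost import CatBoostRegressor

    from etna.transforms.feature_selection import TreeFeatureSelectionTransform

    # Pre-defined option, resolved inside __init__ to CatBoostRegressor(silent=True).
    selector = TreeFeatureSelectionTransform(model="catboost", top_k=5)

    # Equivalent explicit construction with a custom estimator.
    selector = TreeFeatureSelectionTransform(model=CatBoostRegressor(silent=True), top_k=5)

    # Any other string is rejected at construction time:
    # TreeFeatureSelectionTransform(model="unknown", top_k=5)  # raises ValueError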
+ Pre-defined options are also available: + + * catboost: ``catboost.CatBoostRegressor(silent=True)``, this model won't work if there are any category types + + * random_forest: ``sklearn.ensemble.RandomForestRegressor(random_state=0)`` + top_k: num of features to select; if there are not enough features, then all will be selected features_to_use: @@ -64,8 +77,16 @@ def __init__( if not isinstance(top_k, int) or top_k < 0: raise ValueError("Parameter top_k should be positive integer") super().__init__(features_to_use=features_to_use, return_features=return_features) - self.model = model self.top_k = top_k + if isinstance(model, str): + if model == "catboost": + self.model = CatBoostRegressor(silent=True) + elif model == "random_forest": + self.model = RandomForestRegressor(random_state=0) + else: + raise ValueError(f"Not a valid option for model: {model}") + else: + self.model = model def _get_train(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: """Get train data for model.""" @@ -102,7 +123,7 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform": Returns ------- - result: TreeFeatureSelectionTransform + result: instance after fitting """ if len(self._get_features_to_use(df)) == 0: @@ -112,6 +133,24 @@ def _fit(self, df: pd.DataFrame) -> "TreeFeatureSelectionTransform": self.selected_features = self._select_top_k_features(weights, self.top_k) return self + def params_to_tune(self) -> Dict[str, "BaseDistribution"]: + """Get default grid for tuning hyperparameters. + + This grid tunes parameters: ``model``, ``top_k``. Other parameters are expected to be set by the user. + + For ``model`` parameter only pre-defined options are suggested. + For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``. + + Returns + ------- + : + Grid to tune. + """ + return { + "model": CategoricalDistribution(["catboost", "random_forest"]), + "top_k": IntUniformDistribution(low=1, high=self.top_k), + } + class MRMRFeatureSelectionTransform(BaseFeatureSelectionTransform): """Transform that selects features according to MRMR variable selection method adapted to the timeseries case. @@ -176,7 +215,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": Returns ------- - result: MRMRFeatureSelectionTransform + result: instance after fitting """ features = self._get_features_to_use(df) @@ -193,3 +232,19 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": atol=self.atol, ) return self + + def params_to_tune(self) -> Dict[str, "BaseDistribution"]: + """Get default grid for tuning hyperparameters. + + This grid tunes only ``top_k`` parameter. Other parameters are expected to be set by the user. + + For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``. + + Returns + ------- + : + Grid to tune. 
+ """ + return { + "top_k": IntUniformDistribution(low=1, high=self.top_k), + } diff --git a/etna/transforms/feature_selection/gale_shapley.py b/etna/transforms/feature_selection/gale_shapley.py index b1e452f05..b57aa72ab 100644 --- a/etna/transforms/feature_selection/gale_shapley.py +++ b/etna/transforms/feature_selection/gale_shapley.py @@ -8,10 +8,16 @@ import pandas as pd from typing_extensions import Literal +from etna import SETTINGS from etna.analysis import RelevanceTable from etna.core import BaseMixin from etna.transforms.feature_selection.base import BaseFeatureSelectionTransform +if SETTINGS.auto_required: + from optuna.distributions import BaseDistribution + from optuna.distributions import CategoricalDistribution + from optuna.distributions import IntUniformDistribution + class BaseGaleShapley(BaseMixin): """Base class for a member of Gale-Shapley matching.""" @@ -385,3 +391,20 @@ def _fit(self, df: pd.DataFrame) -> "GaleShapleyFeatureSelectionTransform": segment_features_ranking=segment_features_ranking, features_to_drop=selected_features ) return self + + def params_to_tune(self) -> Dict[str, "BaseDistribution"]: + """Get default grid for tuning hyperparameters. + + This grid tunes parameters: ``top_k``, ``use_rank``. Other parameters are expected to be set by the user. + + For ``top_k`` parameter the maximum suggested value is not greater than ``self.top_k``. + + Returns + ------- + : + Grid to tune. + """ + return { + "top_k": IntUniformDistribution(low=1, high=self.top_k), + "use_rank": CategoricalDistribution([False, True]), + } diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index 5ab511e13..9d1f65aec 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -18,6 +18,7 @@ from etna.transforms import SegmentEncoderTransform from etna.transforms.feature_selection import TreeFeatureSelectionTransform from etna.transforms.feature_selection.feature_importance import MRMRFeatureSelectionTransform +from tests.test_transforms.utils import assert_sampling_is_valid from tests.test_transforms.utils import assert_transformation_equals_loaded_original @@ -67,9 +68,16 @@ def ts_with_regressors(): ) +def test_create_with_unknown_model(ts_with_exog): + with pytest.raises(ValueError, match="Not a valid option for model: .*"): + _ = TreeFeatureSelectionTransform(model="unknown", top_k=3, features_to_use="all") + + @pytest.mark.parametrize( "model", [ + "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -86,6 +94,7 @@ def test_work_with_non_regressors(ts_with_exog, model): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -113,6 +122,7 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -141,6 +151,7 @@ def test_retain_values(model, top_k, ts_with_regressors): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), 
ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -158,6 +169,8 @@ def test_fails_negative_top_k(model): @pytest.mark.parametrize( "model", [ + "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -178,6 +191,7 @@ def test_warns_no_regressors(model, example_tsds): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -202,6 +216,7 @@ def test_sanity_selected(model, ts_with_regressors): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -231,6 +246,7 @@ def test_sanity_model(model, ts_with_regressors): @pytest.mark.parametrize( "model", [ + "random_forest", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -286,3 +302,19 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors): ) def test_save_load(transform, ts_with_regressors): assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_regressors) + + +@pytest.mark.parametrize( + "transform", + [ + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=3), + MRMRFeatureSelectionTransform( + relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42) + ), + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3), + ], +) +def test_params_to_tune(transform, ts_with_regressors): + ts = ts_with_regressors + assert len(transform.params_to_tune()) > 0 + assert_sampling_is_valid(transform=transform, ts=ts) diff --git a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py index 37cc00ff8..87322e356 100644 --- a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py +++ b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py @@ -16,6 +16,7 @@ from etna.transforms.feature_selection.gale_shapley import FeatureGaleShapley from etna.transforms.feature_selection.gale_shapley import GaleShapleyMatcher from etna.transforms.feature_selection.gale_shapley import SegmentGaleShapley +from tests.test_transforms.utils import assert_sampling_is_valid from tests.test_transforms.utils import assert_transformation_equals_loaded_original @@ -659,3 +660,18 @@ def test_right_number_features_with_integer_division(ts_with_exog_galeshapley): remaining_columns = ts.columns.get_level_values("feature").unique().tolist() assert len(remaining_columns) == top_k + 1 + + +@pytest.mark.parametrize( + "transform", + [ + GaleShapleyFeatureSelectionTransform( + relevance_table=ModelRelevanceTable(), top_k=3, use_rank=False, model=RandomForestRegressor(random_state=42) + ), + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, use_rank=False), + ], +) +def test_params_to_tune(transform, ts_with_large_regressors_number): + ts = ts_with_large_regressors_number + assert len(transform.params_to_tune()) > 0 + assert_sampling_is_valid(transform=transform, ts=ts) From e5c27c863f25810ea2ee082950d1130571c2b89e Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Thu, 27 Apr 2023 
12:40:51 +0300
Subject: [PATCH 2/7] chore: update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2539f434..21f1080a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,6 +56,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add default `params_to_tune` for `TimeSeriesImputerTransform` ([#1232](https://github.com/tinkoff-ai/etna/pull/1232))
 - Add default `params_to_tune` for `DifferencingTransform`, `MedianTransform`, `MaxTransform`, `MinTransform`, `QuantileTransform`, `StdTransform`, `MeanTransform`, `MADTransform`, `MinMaxDifferenceTransform`, `SumTransform`, `BoxCoxTransform`, `YeoJohnsonTransform`, `MaxAbsScalerTransform`, `MinMaxScalerTransform`, `RobustScalerTransform` and `StandardScalerTransform` ([#1233](https://github.com/tinkoff-ai/etna/pull/1233))
 - Add default `params_to_tune` for `LabelEncoderTransform` ([#1242](https://github.com/tinkoff-ai/etna/pull/1242))
+- Add default `params_to_tune` for `TreeFeatureSelectionTransform`, `MRMRFeatureSelectionTransform` and `GaleShapleyFeatureSelectionTransform` ([#1250](https://github.com/tinkoff-ai/etna/pull/1250))
 ### Fixed
 - Fix bug in `GaleShapleyFeatureSelectionTransform` with wrong number of remaining features ([#1110](https://github.com/tinkoff-ai/etna/pull/1110))
 - `ProphetModel` fails with additional seasonality set ([#1157](https://github.com/tinkoff-ai/etna/pull/1157))

From a55ab0967293495d53e05e9b75430edae15b39ac Mon Sep 17 00:00:00 2001
From: "d.a.bunin"
Date: Thu, 27 Apr 2023 12:43:36 +0300
Subject: [PATCH 3/7] test: add test on params_to_tune for FilterFeaturesTransform

---
 .../test_feature_selection/test_filter_transform.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_transforms/test_feature_selection/test_filter_transform.py b/tests/test_transforms/test_feature_selection/test_filter_transform.py
index 5f1adcdf6..42ae4e211 100644
--- a/tests/test_transforms/test_feature_selection/test_filter_transform.py
+++ b/tests/test_transforms/test_feature_selection/test_filter_transform.py
@@ -201,3 +201,16 @@ def test_inverse_transform_back_included_columns(ts_with_features, columns, retu
 )
 def test_save_load(transform, ts_with_features):
     assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_features)
+
+
+@pytest.mark.parametrize(
+    "transform",
+    [
+        FilterFeaturesTransform(include=["target"], return_features=True),
+        FilterFeaturesTransform(include=["target"], return_features=False),
+        FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=True),
+        FilterFeaturesTransform(exclude=["exog_1", "exog_2"], return_features=False),
+    ],
+)
+def test_params_to_tune(transform):
+    assert len(transform.params_to_tune()) == 0

From 30b8c6182408e50dc9408733abd7eb3694a2401c Mon Sep 17 00:00:00 2001
From: "d.a.bunin"
Date: Fri, 28 Apr 2023 11:04:23 +0300
Subject: [PATCH 4/7] docs: add number of trees in documentation for TreeFeatureSelectionTransform

---
 etna/transforms/feature_selection/feature_importance.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py
index c59aaacfb..148bb0177 100644
--- a/etna/transforms/feature_selection/feature_importance.py
+++ b/etna/transforms/feature_selection/feature_importance.py
@@ -63,9 +63,10 @@ def __init__(
             (e.g. all tree-based regressors in sklearn).
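Since the grids returned by params_to_tune throughout this series are plain optuna distributions, any tuner can consume them. A minimal sampling sketch, assuming optuna's ask interface and etna's set_params mixin:

    import optuna
    from optuna.distributions import CategoricalDistribution
    from optuna.distributions import IntUniformDistribution

    from etna.transforms.feature_selection import TreeFeatureSelectionTransform

    transform = TreeFeatureSelectionTransform(model="random_forest", top_k=10)
    trial = optuna.create_study().ask()

    params = {}
    for name, dist in transform.params_to_tune().items():
        if isinstance(dist, CategoricalDistribution):
            params[name] = trial.suggest_categorical(name, dist.choices)
        elif isinstance(dist, IntUniformDistribution):
            # The upper bound never exceeds the user-provided top_k.
            params[name] = trial.suggest_int(name, dist.low, dist.high)

    # etna's BaseMixin.set_params is assumed to return an updated instance.
    tuned_transform = transform.set_params(**params)

IntUniformDistribution is the pre-optuna-3 spelling used here; newer optuna deprecates it in favour of suggest_int, which is why the sketch converts each distribution into a suggest_* call.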
Pre-defined options are also available: - * catboost: ``catboost.CatBoostRegressor(silent=True)``, this model won't work if there are any category types + * catboost: ``catboost.CatBoostRegressor(iterations=1000, silent=True)``, + this model won't work if there are any category types - * random_forest: ``sklearn.ensemble.RandomForestRegressor(random_state=0)`` + * random_forest: ``sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=0)`` top_k: num of features to select; if there are not enough features, then all will be selected @@ -80,7 +81,7 @@ def __init__( self.top_k = top_k if isinstance(model, str): if model == "catboost": - self.model = CatBoostRegressor(silent=True) + self.model = CatBoostRegressor(iterations=1000, silent=True) elif model == "random_forest": self.model = RandomForestRegressor(random_state=0) else: From 579601cb28b49d5b3a339eaadcd6e5b08e3eee82 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 28 Apr 2023 12:08:01 +0300 Subject: [PATCH 5/7] fix: fix behavior with category dtype for catboost model in TreeFeatureSelectionTransform --- .../feature_selection/feature_importance.py | 16 ++++++++++++---- .../test_feature_importance_transform.py | 6 ++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py index 148bb0177..da2b83e50 100644 --- a/etna/transforms/feature_selection/feature_importance.py +++ b/etna/transforms/feature_selection/feature_importance.py @@ -61,12 +61,15 @@ def __init__( model: Model to make selection, it should have ``feature_importances_`` property (e.g. all tree-based regressors in sklearn). + + If ``catboost.CatBoostRegressor`` is given with no ``cat_features`` parameter, + then ``cat_features`` are set during ``fit`` to be equal to columns of category type. + Pre-defined options are also available: - * catboost: ``catboost.CatBoostRegressor(iterations=1000, silent=True)``, - this model won't work if there are any category types + * catboost: ``catboost.CatBoostRegressor(iterations=1000, silent=True)``; - * random_forest: ``sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=0)`` + * random_forest: ``sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=0)``. 
top_k: num of features to select; if there are not enough features, then all will be selected @@ -100,7 +103,12 @@ def _get_train(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: def _get_features_weights(self, df: pd.DataFrame) -> Dict[str, float]: """Get weights for features based on model feature importances.""" train_data, train_target = self._get_train(df) - self.model.fit(train_data, train_target) + if isinstance(self.model, CatBoostRegressor) and self.model.get_param("cat_features") is None: + dtypes = train_data.dtypes + cat_features = dtypes[dtypes == "category"].index.tolist() + self.model.fit(train_data, train_target, cat_features=cat_features) + else: + self.model.fit(train_data, train_target) weights_array = self.model.feature_importances_ weights_dict = {column: weights_array[i] for i, column in enumerate(train_data.columns)} return weights_dict diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index 9d1f65aec..337ef1113 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -95,6 +95,7 @@ def test_work_with_non_regressors(ts_with_exog, model): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -123,6 +124,7 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -152,6 +154,7 @@ def test_retain_values(model, top_k, ts_with_regressors): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -192,6 +195,7 @@ def test_warns_no_regressors(model, example_tsds): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -217,6 +221,7 @@ def test_sanity_selected(model, ts_with_regressors): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), @@ -247,6 +252,7 @@ def test_sanity_model(model, ts_with_regressors): "model", [ "random_forest", + "catboost", DecisionTreeRegressor(random_state=42), ExtraTreeRegressor(random_state=42), RandomForestRegressor(n_estimators=10, random_state=42), From 71558ba87ef49862f85818c231ec659d3b2cad62 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 28 Apr 2023 19:01:51 +0300 Subject: [PATCH 6/7] test: add new fixture, add test on catboost models --- .../test_feature_importance_transform.py | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index 337ef1113..a4e436839 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -68,11 +68,32 @@ def ts_with_regressors(): ) 
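The dtype-based detection added to _get_features_weights above can be exercised in isolation. A small sketch with a hypothetical frame:

    import pandas as pd

    # Hypothetical training frame: one category column, one numeric column.
    train_data = pd.DataFrame(
        {
            "segment_code": pd.Series(["a", "b", "a"], dtype="category"),
            "exog_1": [1.0, 2.0, 3.0],
        }
    )

    # Same detection as in the transform: pick columns of category dtype.
    dtypes = train_data.dtypes
    cat_features = dtypes[dtypes == "category"].index.tolist()
    assert cat_features == ["segment_code"]

This is what lets a bare CatBoostRegressor fit on data containing category columns without an explicit cat_features argument.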
+@pytest.fixture +def ts_with_regressors_and_features(ts_with_regressors): + le_encoder = SegmentEncoderTransform() + le_encoder.fit_transform(ts_with_regressors) + return ts_with_regressors + + def test_create_with_unknown_model(ts_with_exog): with pytest.raises(ValueError, match="Not a valid option for model: .*"): _ = TreeFeatureSelectionTransform(model="unknown", top_k=3, features_to_use="all") +@pytest.mark.parametrize( + "model", + [ + "catboost", + CatBoostRegressor(iterations=10, random_state=42, silent=True), + CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["segment_code"]), + ], +) +def test_catboost_with_cat_features(model, ts_with_regressors_and_features): + """Check that transform with catboost model can work with cat features in a dataset.""" + selector = TreeFeatureSelectionTransform(model=model, top_k=3, features_to_use="all") + selector.fit_transform(ts_with_regressors_and_features) + + @pytest.mark.parametrize( "model", [ @@ -105,14 +126,10 @@ def test_work_with_non_regressors(ts_with_exog, model): ], ) @pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_selected_top_k_regressors(model, top_k, ts_with_regressors): +def test_selected_top_k_regressors(model, top_k, ts_with_regressors_and_features): """Check that transform selects exactly top_k regressors if where are this much.""" - all_regressors = ts_with_regressors.regressors - all_regressors.append("segment_code") - - ts = ts_with_regressors - le_encoder = SegmentEncoderTransform() - le_encoder.fit_transform(ts) + ts = ts_with_regressors_and_features + all_regressors = ts_with_regressors_and_features.regressors selector = TreeFeatureSelectionTransform(model=model, top_k=top_k) selector.fit_transform(ts) @@ -134,16 +151,14 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors): ], ) @pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_retain_values(model, top_k, ts_with_regressors): +def test_retain_values(model, top_k, ts_with_regressors_and_features): """Check that transform doesn't change values of columns.""" - ts = ts_with_regressors - le_encoder = SegmentEncoderTransform() - le_encoder.fit_transform(ts) + ts = ts_with_regressors_and_features df_encoded = ts.to_pandas() selector = TreeFeatureSelectionTransform(model=model, top_k=top_k) df_selected = selector.fit_transform(ts).to_pandas() - for segment in ts_with_regressors.segments: + for segment in ts.segments: for column in df_selected.columns.get_level_values("feature").unique(): assert ( df_selected.loc[:, pd.IndexSlice[segment, column]] == df_encoded.loc[:, pd.IndexSlice[segment, column]] @@ -204,11 +219,9 @@ def test_warns_no_regressors(model, example_tsds): CatBoostRegressor(iterations=700, random_state=42, silent=True, cat_features=["segment_code"]), ], ) -def test_sanity_selected(model, ts_with_regressors): +def test_sanity_selected(model, ts_with_regressors_and_features): """Check that transform correctly finds meaningful regressors.""" - ts = ts_with_regressors - le_encoder = SegmentEncoderTransform() - le_encoder.fit_transform(ts) + ts = ts_with_regressors_and_features selector = TreeFeatureSelectionTransform(model=model, top_k=8) df_selected = selector.fit_transform(ts).to_pandas() features_columns = df_selected.columns.get_level_values("feature").unique() From 48638b74fc8bf5ffcd066227cc5585ced2735596 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 28 Apr 2023 19:11:39 +0300 Subject: [PATCH 7/7] test: simplify tests --- .../test_feature_importance_transform.py | 22 
+++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index a4e436839..56fe0ba84 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -122,14 +122,14 @@ def test_work_with_non_regressors(ts_with_exog, model): RandomForestRegressor(n_estimators=10, random_state=42), ExtraTreesRegressor(n_estimators=10, random_state=42), GradientBoostingRegressor(n_estimators=10, random_state=42), - CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["segment_code"]), + CatBoostRegressor(iterations=10, random_state=42, silent=True), ], ) @pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_selected_top_k_regressors(model, top_k, ts_with_regressors_and_features): +def test_selected_top_k_regressors(model, top_k, ts_with_regressors): """Check that transform selects exactly top_k regressors if where are this much.""" - ts = ts_with_regressors_and_features - all_regressors = ts_with_regressors_and_features.regressors + ts = ts_with_regressors + all_regressors = ts_with_regressors.regressors selector = TreeFeatureSelectionTransform(model=model, top_k=top_k) selector.fit_transform(ts) @@ -147,13 +147,13 @@ def test_selected_top_k_regressors(model, top_k, ts_with_regressors_and_features RandomForestRegressor(n_estimators=10, random_state=42), ExtraTreesRegressor(n_estimators=10, random_state=42), GradientBoostingRegressor(n_estimators=10, random_state=42), - CatBoostRegressor(iterations=10, random_state=42, silent=True, cat_features=["segment_code"]), + CatBoostRegressor(iterations=10, random_state=42, silent=True), ], ) @pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_retain_values(model, top_k, ts_with_regressors_and_features): +def test_retain_values(model, top_k, ts_with_regressors): """Check that transform doesn't change values of columns.""" - ts = ts_with_regressors_and_features + ts = ts_with_regressors df_encoded = ts.to_pandas() selector = TreeFeatureSelectionTransform(model=model, top_k=top_k) df_selected = selector.fit_transform(ts).to_pandas() @@ -216,13 +216,13 @@ def test_warns_no_regressors(model, example_tsds): RandomForestRegressor(n_estimators=10, random_state=42), ExtraTreesRegressor(n_estimators=10, random_state=42), GradientBoostingRegressor(n_estimators=10, random_state=42), - CatBoostRegressor(iterations=700, random_state=42, silent=True, cat_features=["segment_code"]), + CatBoostRegressor(iterations=700, random_state=42, silent=True), ], ) -def test_sanity_selected(model, ts_with_regressors_and_features): +def test_sanity_selected(model, ts_with_regressors): """Check that transform correctly finds meaningful regressors.""" - ts = ts_with_regressors_and_features - selector = TreeFeatureSelectionTransform(model=model, top_k=8) + ts = ts_with_regressors + selector = TreeFeatureSelectionTransform(model=model, top_k=10) df_selected = selector.fit_transform(ts).to_pandas() features_columns = df_selected.columns.get_level_values("feature").unique() selected_regressors = [column for column in features_columns if column.startswith("regressor_")]
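With the series applied, each feature-selection transform exposes a default grid. A closing sketch that samples the Gale-Shapley grid (constructor arguments mirror the tests above; import paths follow the modules touched in patch 1):

    import optuna
    from sklearn.ensemble import RandomForestRegressor

    from etna.analysis import ModelRelevanceTable
    from etna.transforms.feature_selection.gale_shapley import GaleShapleyFeatureSelectionTransform

    transform = GaleShapleyFeatureSelectionTransform(
        relevance_table=ModelRelevanceTable(),
        top_k=3,
        use_rank=False,
        model=RandomForestRegressor(random_state=42),
    )

    # Two entries: "top_k" (IntUniformDistribution) and "use_rank" (CategoricalDistribution).
    grid = transform.params_to_tune()

    trial = optuna.create_study().ask()
    top_k = trial.suggest_int("top_k", grid["top_k"].low, grid["top_k"].high)  # 1..3 here
    use_rank = trial.suggest_categorical("use_rank", grid["use_rank"].choices)  # False or True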