From f200512abbd666a90ad6b15772ebd28d79d4cdfd Mon Sep 17 00:00:00 2001 From: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:33:41 -0400 Subject: [PATCH 01/19] Adding model selection functionality Signed-off-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Co-authored-by: ShrutiRM97 <98553136+ShrutiRM97@users.noreply.github.com> Co-authored-by: CooperGibbs --- econml/dml/dml.py | 68 ++- econml/new_tests/test_model_selection.py | 273 +++++++++ .../new_tests/test_model_selection_utils.py | 235 ++++++++ econml/sklearn_extensions/model_selection.py | 252 +++++++- .../model_selection_utils.py | 563 ++++++++++++++++++ econml/tests/test_dml.py | 12 +- econml/utilities.py | 24 +- 7 files changed, 1399 insertions(+), 28 deletions(-) create mode 100644 econml/new_tests/test_model_selection.py create mode 100644 econml/new_tests/test_model_selection_utils.py create mode 100644 econml/sklearn_extensions/model_selection_utils.py diff --git a/econml/dml/dml.py b/econml/dml/dml.py index ce579d519..7ff5ad354 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -34,6 +34,8 @@ reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, filter_none_kwargs) from .._shap import _shap_explain_model_cate +from ..sklearn_extensions.model_selection import SearchEstimatorList +import pdb class _FirstStageWrapper: @@ -356,6 +358,14 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn The estimator for fitting the response residuals to the treatment residuals. Must implement `fit` and `predict` methods, and must be a linear model for correctness. + param_list_y, param_list_t: list, 'auto', or None, default None + The parameter grids to be searched during cross-validation of `model_y` and `model_t`, respectively. + If 'auto', the grid will be chosen based on the model type. + + scaling: bool, default False + Whether to scale the features during the estimation process. + Scaling can help improve the performance of some models.
+ featurizer: :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). @@ -380,6 +390,9 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. + verbose: int, default 2 + The verbosity level of the output messages. Higher values indicate more verbosity. + cv: int, cross-validation generator or an iterable, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -469,13 +482,19 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn def __init__(self, *, model_y, model_t, model_final, + param_list_y=None, + param_list_t=None, + scaling=False, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, linear_first_stages=False, discrete_treatment=False, categories='auto', + verbose=2, # New cv=2, + grid_folds=2, # New + n_jobs=None, # New mc_iters=None, mc_agg='mean', random_state=None, @@ -487,6 +506,13 @@ def __init__(self, *, # since we clone it and fit separate copies self.fit_cate_intercept = fit_cate_intercept self.linear_first_stages = linear_first_stages + self.scaling = scaling + self.param_list_y = param_list_y + self.param_list_t = param_list_t + self.verbose = verbose + self.cv = cv + self.grid_folds = grid_folds + self.n_jobs = n_jobs self.featurizer = clone(featurizer, safe=False) self.model_y = clone(model_y, safe=False) self.model_t = clone(model_t, safe=False) @@ -508,23 +534,37 @@ def _gen_allowed_missing_vars(self): def _gen_featurizer(self): return clone(self.featurizer, safe=False) - def _gen_model_y(self): + def _gen_model_y(self): # New if self.model_y == 'auto': - model_y = 
WeightedLassoCVWrapper(random_state=self.random_state) + model_y = SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, + scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state) else: - model_y = clone(self.model_y, safe=False) + model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, + scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) + # model_y = clone(self.model_y, safe=False) return _FirstStageWrapper(model_y, True, self._gen_featurizer(), self.linear_first_stages, self.discrete_treatment) - def _gen_model_t(self): + def _gen_model_t(self): # New + # import pdb + # pdb.set_trace() if self.model_t == 'auto': if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) + model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, + scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment, + n_jobs=self.n_jobs, random_state=self.random_state) else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) + model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, + scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, + n_jobs=self.n_jobs, random_state=self.random_state) + # model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), + # model_t = WeightedLassoCVWrapper(random_state=self.random_state) else: - model_t = clone(self.model_t, safe=False) + model_t = clone(SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, + scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, + 
n_jobs=self.n_jobs, random_state=self.random_state), safe=False) + # model_t = clone(self.model_t, safe=False) + return _FirstStageWrapper(model_t, False, self._gen_featurizer(), self.linear_first_stages, self.discrete_treatment) @@ -716,13 +756,19 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): def __init__(self, *, model_y='auto', model_t='auto', + param_list_y=None, + param_list_t=None, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, linear_first_stages=True, discrete_treatment=False, categories='auto', + scaling=True, + verbose=2, cv=2, + grid_folds=2, + n_jobs=None, mc_iters=None, mc_agg='mean', random_state=None, @@ -733,6 +779,8 @@ def __init__(self, *, super().__init__(model_y=model_y, model_t=model_t, + param_list_y=param_list_y, + param_list_t=param_list_t, model_final=None, featurizer=featurizer, treatment_featurizer=treatment_featurizer, @@ -740,7 +788,11 @@ def __init__(self, *, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, + scaling=scaling, + verbose=verbose, cv=cv, + n_jobs=n_jobs, + grid_folds=grid_folds, mc_iters=mc_iters, mc_agg=mc_agg, random_state=random_state, diff --git a/econml/new_tests/test_model_selection.py b/econml/new_tests/test_model_selection.py new file mode 100644 index 000000000..b007ddd21 --- /dev/null +++ b/econml/new_tests/test_model_selection.py @@ -0,0 +1,273 @@ +import unittest + +import numpy as np +from econml.sklearn_extensions.model_selection import * +from econml.sklearn_extensions.model_selection_utils import * +from sklearn.datasets import fetch_california_housing, load_iris +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, f1_score +from sklearn.pipeline import make_pipeline +from sklearn.svm import SVR + + +class TestSearchEstimatorListClassifier(unittest.TestCase): + def setUp(self): + self.expected_accuracy = 0.9 + 
self.expected_f1_score = 0.9 + self.accuracy_tolerance = 0.05 + self.f1_score_tolerance = 0.05 + self.is_discrete = True + X, y = load_iris(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42) + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + + def test_initialization(self): + with self.assertRaises(ValueError): + SearchEstimatorList(estimator_list='invalid_estimator') + + def test_auto_param_grid_discrete(self): + + search_estimator_list = SearchEstimatorList(is_discrete=self.is_discrete, scaling=False) + search_estimator_list.fit(self.X_train, self.y_train) + self.assertIsNotNone(search_estimator_list.best_estimator_) + self.assertIsNotNone(search_estimator_list.best_score_) + self.assertIsNotNone(search_estimator_list.best_params_) + + def test_linear_estimator(self): + search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_poly_estimator(self): + search = SearchEstimatorList(estimator_list='poly', is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertTrue(is_polynomial_pipeline(search.complete_estimator_list[0])) + + self.assertGreaterEqual(acc, self.expected_accuracy) + 
self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_forest_estimator(self): + search = SearchEstimatorList(estimator_list='forest', is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], RandomForestClassifier) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_gbf_estimator(self): + search = SearchEstimatorList(estimator_list='gbf', is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], GradientBoostingClassifier) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_nnet_estimator(self): + search = SearchEstimatorList(estimator_list='nnet', is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], MLPClassifier) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_linear_and_forest_estimators(self): + search = 
SearchEstimatorList(estimator_list=['linear', 'forest'], is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 2) + self.assertEqual(len(search.param_grid_list), 2) + self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) + self.assertIsInstance(search.complete_estimator_list[1], RandomForestClassifier) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_all_estimators(self): + search = SearchEstimatorList(estimator_list=['linear', 'forest', + 'gbf', 'nnet', 'poly'], is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 5) + self.assertEqual(len(search.param_grid_list), 5) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_logistic_regression_estimator(self): + search = SearchEstimatorList(estimator_list=LogisticRegression(), is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_logistic_regression_cv_estimator(self): + search = SearchEstimatorList(estimator_list=LogisticRegressionCV(), + is_discrete=self.is_discrete, scaling=False) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = 
f1_score(self.y_test, y_pred, average='macro') + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_empty_estimator_list(self): + with self.assertRaises(ValueError): + search = SearchEstimatorList(estimator_list=[], is_discrete=self.is_discrete, scaling=False) + + def test_invalid_regressor(self): + with self.assertRaises(TypeError): + estimator_list = [SVR(kernel='linear')] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_polynomial_pipeline_regressor(self): + with self.assertRaises(TypeError): + estimator_list = [make_pipeline(PolynomialFeatures(), ElasticNetCV())] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_mlp_regressor(self): + with self.assertRaises(TypeError): + estimator_list = [MLPRegressor()] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_random_forest_regressor(self): + with self.assertRaises(TypeError): + estimator_list = [RandomForestRegressor()] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_gradient_boosting_regressor(self): + with self.assertRaises(TypeError): + estimator_list = [GradientBoostingRegressor()] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_combined_estimators(self): + with self.assertRaises(TypeError): + estimator_list = [LogisticRegression(), SVC(), GradientBoostingRegressor()] + search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) + + def test_random_forest_discrete(self): + estimator_list = [RandomForestClassifier()] + param_grid_list = [{'n_estimators': [10, 50, 100], 'max_depth': [3, 5, None]}] + + search = SearchEstimatorList( + estimator_list=estimator_list, param_grid_list=param_grid_list, is_discrete=self.is_discrete, scaling=False) + 
search.fit(self.X_train, self.y_train) + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + + self.assertIsNotNone(search.best_estimator_) + self.assertIsNotNone(search.best_score_) + self.assertIsNotNone(search.best_params_) + + def test_data_scaling(self): + search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=True) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_custom_scoring_function(self): + def custom_scorer(y_true, y_pred): + return f1_score(y_true, y_pred, average='macro') + + search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, + scaling=False, scoring=custom_scorer) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + # def test_refit_false(self): + # search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False, refit=False) + # search.fit(self.X_train, self.y_train) + # with self.assertRaises(NotFittedError): + # y_pred = search.predict(self.X_test) + + def test_custom_random_state(self): + search = 
SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, + scaling=False, random_state=42) + search.fit(self.X_train, self.y_train) + y_pred = search.predict(self.X_test) + acc = accuracy_score(self.y_test, y_pred) + f1 = f1_score(self.y_test, y_pred, average='macro') + + self.assertEqual(len(search.complete_estimator_list), 1) + self.assertEqual(len(search.param_grid_list), 1) + self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) + + self.assertGreaterEqual(acc, self.expected_accuracy) + self.assertGreaterEqual(f1, self.expected_f1_score) + + def test_invalid_custom_scoring_function(self): + with self.assertRaises(ValueError): + search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, + scaling=False, scoring='invalid_scorer') + + def test_invalid_incorrect_scoring_numbers(self): + with self.assertRaises(ValueError): + search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, + scaling=False, scoring=123) + + + + +if __name__ == '__main__': + unittest.main() diff --git a/econml/new_tests/test_model_selection_utils.py b/econml/new_tests/test_model_selection_utils.py new file mode 100644 index 000000000..8e7e7c917 --- /dev/null +++ b/econml/new_tests/test_model_selection_utils.py @@ -0,0 +1,235 @@ +import unittest + +import numpy as np +from econml.sklearn_extensions.model_selection import * +from econml.sklearn_extensions.model_selection_utils import * +from sklearn.datasets import fetch_california_housing, load_iris +from sklearn.preprocessing import StandardScaler, PolynomialFeatures +from sklearn.model_selection import train_test_split +from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV + + +class TestIsDataScaled(unittest.TestCase): + + def test_scaled_data(self): + # Test with data that is already centered and scaled + X = np.array([[0.0, -1.0], [1.0, 0.0], [-1.0, 1.0]]) + scale = StandardScaler() + scaled_X = scale.fit_transform(X) + 
self.assertTrue(is_data_scaled(scaled_X)) + + def test_unscaled_data(self): + # Test with data that is not centered and scaled + X = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]]) + self.assertFalse(is_data_scaled(X)) + + def test_large_scaled_data(self): + # Test with a larger dataset that is already centered and scaled + np.random.seed(42) + X = np.random.randn(1000, 5) + scale = StandardScaler() + scaled_X = scale.fit_transform(X) + self.assertTrue(is_data_scaled(scaled_X)) + + def test_large_unscaled_data(self): + np.random.seed(42) + X = np.random.randn(1000, 5) + self.assertFalse(is_data_scaled(X)) + + def test_is_data_scaled_with_scaled_iris_dataset(self): + X, y = load_iris(return_X_y=True) + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + assert is_data_scaled(X_scaled) == True + + def test_is_data_scaled_with_unscaled_iris_dataset(self): + X, y = load_iris(return_X_y=True) + assert is_data_scaled(X) == False + + def test_is_data_scaled_with_scaled_california_housing_dataset(self): + X, y = housing = fetch_california_housing(return_X_y=True) + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + assert is_data_scaled(X_scaled) == True + + def test_is_data_scaled_with_unscaled_california_housing_dataset(self): + X, y = fetch_california_housing(return_X_y=True) + assert is_data_scaled(X) == False + + +class TestFlattenList(unittest.TestCase): + + def test_flatten_empty_list(self): + input = [] + expected_output = [] + self.assertEqual(flatten_list(input), expected_output) + + def test_flatten_simple_list(self): + input = [1, 10, 15] + expected_output = [1, 10, 15] + self.assertEqual(flatten_list(input), expected_output) + + def test_flatten_nested_list(self): + input = [1, [10, 15], [20, [25, 30]]] + expected_output = [1, 10, 15, 20, 25, 30] + self.assertEqual(flatten_list(input), expected_output) + + # Check functionality for below + # def test_flatten_none_list(self): + # input = [[1, 10, None], 15, None] + # expected_output = 
[1, 10, None, 15, None] + # self.assertEqual(flatten_list(input), expected_output) + + def test_flatten_iris_dataset(self): + X = load_iris() + input = X.data.tolist() + expected_output = sum(X.data.tolist(), []) + self.assertEqual(flatten_list(input), expected_output) + + def test_flatten_california_housing_dataset(self): + X = fetch_california_housing() + input = X.data.tolist() + expected_output = sum(X.data.tolist(), []) + self.assertEqual(flatten_list(input), expected_output) + + +class TestIsPolynomialPipeline(unittest.TestCase): + + def test_is_polynomial_pipeline_true(self): + X = np.array([[5, 10], [15, 20], [25, 30], [35, 40], [45, 50]]) + y = np.array([15, 29, 38, 47, 55]) + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + model = Pipeline([ + ('poly', PolynomialFeatures(degree=2)), + ('linear', ElasticNetCV()) + ]) + model.fit(X_scaled, y) + assert is_polynomial_pipeline(model) == True + + def test_is_polynomial_pipeline_false(self): + model = ElasticNetCV() + assert is_polynomial_pipeline(model) == False + + def test_is_polynomial_pipeline_false_step_number(self): + X, y = load_iris(return_X_y=True) + model = Pipeline([ + ('poly', PolynomialFeatures(degree=2)), + ('linear', LogisticRegressionCV()), + ('step_false', '') + ]) + assert is_polynomial_pipeline(model) == False + + def test_is_polynomial_pipeline_interchange_steps(self): + X, y = load_iris(return_X_y=True) + model = Pipeline([ + ('poly', LogisticRegressionCV()), + ('linear', PolynomialFeatures(degree=2)), + ]) + assert is_polynomial_pipeline(model) == False + + # Cross-check functionaity - can the 'poly' keyword be changed to something else + def test_is_polynomial_pipeline_false_first_step(self): + X, y = fetch_california_housing(return_X_y=True) + model = Pipeline([ + ('not_poly', PolynomialFeatures(degree=2)), + ('linear', ElasticNetCV()) + ]) + assert is_polynomial_pipeline(model) == True + + +class TestCheckListType(unittest.TestCase): + + def 
test_check_list_type_true(self): + list = ['linear', LogisticRegressionCV(), KFold()] + assert check_list_type(list) == True + + def test_check_list_type_false_string(self): + list = [18, LogisticRegressionCV(), KFold()] + try: + check_list_type(list) + except TypeError as e: + assert str(e) == "The list must contain only strings, sklearn model objects, and sklearn model selection objects." + + def test_check_list_type_empty(self): + list = [] + try: + check_list_type(list) + except ValueError as e: + assert str(e) == "Estimator list is empty. Please add some models or use some of the defaults provided." + + def test_check_list_type_all_strings(self): + list = ['linear', 'lasso', 'forest'] + assert check_list_type(list) == True + + def test_check_list_type_all_models(self): + list = [LogisticRegressionCV(), ElasticNetCV()] + assert check_list_type(list) == True + + def test_check_list_duplicate_models_strings(self): + list = [LogisticRegressionCV(), LogisticRegressionCV(), 'linear', 'linear'] + assert check_list_type(list) == True + + +class TestSelectContinuousEstimator(unittest.TestCase): + + def test_select_continuous_estimator_valid(self): + assert isinstance(select_continuous_estimator('linear'), ElasticNetCV) + assert isinstance(select_continuous_estimator('forest'), RandomForestRegressor) + assert isinstance(select_continuous_estimator('gbf'), GradientBoostingRegressor) + assert isinstance(select_continuous_estimator('nnet'), MLPRegressor) + assert isinstance(select_continuous_estimator('poly'), Pipeline) + + def test_select_continuous_estimator_invalid(self): + try: + select_continuous_estimator('ridge') + except ValueError as e: + assert str(e) == 'Unsupported estimator type: ridge' + + +class TestSelectDiscreteEstimator(unittest.TestCase): + + def test_select_discrete_estimator_valid(self): + assert isinstance(select_discrete_estimator('linear'), LogisticRegressionCV) + assert isinstance(select_discrete_estimator('forest'), RandomForestClassifier) + 
assert isinstance(select_discrete_estimator('gbf'), GradientBoostingClassifier) + assert isinstance(select_discrete_estimator('nnet'), MLPClassifier) + assert isinstance(select_discrete_estimator('poly'), Pipeline) + + def test_select_discrete_estimator_invalid(self): + try: + select_discrete_estimator('lasso') + except ValueError as e: + assert str(e) == 'Unsupported estimator type: lasso' + + +class TestSelectEstimator(unittest.TestCase): + + def test_select_estimator_valid(self): + assert isinstance(select_estimator('linear', is_discrete=False), ElasticNetCV) + assert isinstance(select_estimator('forest', is_discrete=False), RandomForestRegressor) + assert isinstance(select_estimator('gbf', is_discrete=False), GradientBoostingRegressor) + assert isinstance(select_estimator('nnet', is_discrete=False), MLPRegressor) + assert isinstance(select_estimator('poly', is_discrete=False), Pipeline) + + assert isinstance(select_estimator('linear', is_discrete=True), LogisticRegression) + assert isinstance(select_estimator('forest', is_discrete=True), RandomForestClassifier) + assert isinstance(select_estimator('gbf', is_discrete=True), GradientBoostingClassifier) + assert isinstance(select_estimator('nnet', is_discrete=True), MLPClassifier) + assert isinstance(select_estimator('poly', is_discrete=True), Pipeline) + + def test_select_estimator_invalid_estimator(self): + try: + select_estimator('lasso', is_discrete=True) + except ValueError as e: + assert str(e) == 'Unsupported estimator type: lasso' + + def test_select_estimator_invalid(self): + try: + select_estimator('linear', is_discrete=None) + except ValueError as e: + assert str(e) == 'Unsupported target type: None' + + +if __name__ == '__main__': + unittest.main() diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index 79b714bbc..e9c82ddc2 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -1,25 +1,30 @@ 
# Copyright (c) PyWhy contributors. All rights reserved. # Licensed under the MIT License. - """Collection of scikit-learn extensions for model selection techniques.""" import numbers +import pdb import warnings -import sklearn -from sklearn.base import BaseEstimator -from sklearn.utils.multiclass import type_of_target + import numpy as np import scipy.sparse as sp +import sklearn from joblib import Parallel, delayed -from sklearn.base import clone, is_classifier -from sklearn.model_selection import KFold, StratifiedKFold, check_cv, GridSearchCV +from sklearn.base import BaseEstimator, clone, is_classifier +from sklearn.exceptions import FitFailedWarning +from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, KFold, + RandomizedSearchCV, StratifiedKFold, + check_cv) # TODO: conisder working around relying on sklearn implementation details from sklearn.model_selection._validation import (_check_is_permutation, _fit_and_predict) from sklearn.preprocessing import LabelEncoder -from sklearn.utils import indexable, check_random_state +from sklearn.utils import check_random_state, indexable +from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples +from econml.sklearn_extensions.model_selection_utils import * + def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False): random_state = self.random_state if self.shuffle else None @@ -256,6 +261,216 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits +class SearchEstimatorList(BaseEstimator): + """ + The SearchEstimatorList is a utility class for hyperparameter tuning. + It provides a convenient way to perform GridSearch cross-validation for + a list of estimators. The class automates the process of hyperparameter + tuning, model fitting, and prediction for multiple estimators. 
+ + Parameters + ---------- + estimator_list : list, string, or sklearn model object, default ['linear', 'forest'] + The estimators to be used for grid search, given as estimator names, sklearn model objects, or a list of either. + + param_grid_list : list, 'auto', or None, default None + A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. If set to 'auto', the class automatically generates hyperparameters for the estimators. If None, an empty parameter grid is used for every estimator. + + scaling : bool, default False + Indicates whether to scale the input data using StandardScaler. + + is_discrete : bool, default False + Specifies whether the target is discrete, i.e. whether the estimators in `estimator_list` are classifiers rather than regressors. + + scoring : str or None, default None + The scoring metric to be used for selecting the best estimator. If None, 'f1_macro' is used when `is_discrete` is True and 'neg_mean_squared_error' otherwise. + + n_jobs : int or None, default None + The number of CPU cores to use for parallel processing during grid search. + + refit : bool, default True + Determines whether to refit the best estimator with the entire dataset after grid search. + + cv : int or cross-validation generator, default 2 + The cross-validation splitting strategy used during grid search. If an int, it is the number of folds and must be at least 2. + + verbose : int, default 2 + Verbosity level of the class's methods and inner workings. + + pre_dispatch : str, default '2*n_jobs' + Controls the number of jobs that get dispatched during parallel execution of the grid search. + + random_state : int, RandomState instance, or None, default None + If int, `random_state` is the seed used by the random number generator; + If `RandomState` instance, `random_state` is the random number generator; + If None, the random number generator is the `RandomState` instance used by `np.random`. When not None, it is also set on every estimator that has a `random_state` parameter. + + error_score : float or 'raise', default np.nan + The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', an error is raised. + + return_train_score : bool, default False + Determines whether to include training scores in the `cv_results_` attribute of the class.
+ + categorical_indices : str, int, list, or None default None + List of categorical indices + """ + + def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, is_discrete=False, scoring=None, + n_jobs=None, refit=True, cv=2, verbose=2, pre_dispatch='2*n_jobs', random_state=None, + error_score=np.nan, return_train_score=False, categorical_indices=None): + # pdb.set_trace() + self.estimator_list = estimator_list + self.complete_estimator_list = get_complete_estimator_list( + clone(estimator_list, safe=False), is_discrete=is_discrete, random_state=random_state) + + # TODO Add in more functionality by checking if it's an empty list. If it's just 1 dictionary then we're going to need to turn it into a list + # Just do more cases + if param_grid_list == 'auto': + self.param_grid_list = auto_hyperparameters( + estimator_list=self.complete_estimator_list, is_discrete=is_discrete) + elif (param_grid_list == None): + self.param_grid_list = len(self.complete_estimator_list) * [{}] + else: + self.param_grid_list = param_grid_list + self.categorical_indices = categorical_indices + self.scoring = scoring + if scoring == None: + if is_discrete: + self.scoring = 'f1_macro' + else: + self.scoring = 'neg_mean_squared_error' + warnings.warn(f"No scoring value was given. 
Using default score method {self.scoring}.") + self.scaling = scaling + self.n_jobs = n_jobs + self.refit = refit + self.cv = cv + self.verbose = verbose + self.random_state = random_state + self.pre_dispatch = pre_dispatch + self.error_score = error_score + self.return_train_score = return_train_score + self.is_discrete = is_discrete + + def fit(self, X, y, *, sample_weight=None, groups=None): + # print(groups) + # if groups != None: + # pdb.set_trace() + + self._search_list = [] + # pdb.set_trace() + # Change estimators if multi_task + if is_likely_multi_task(y): + for index, estimator in enumerate(self.complete_estimator_list): + if not can_handle_multitask(model=estimator, is_discrete=self.is_discrete): + self.complete_estimator_list[index] = make_model_multi_task( + model=estimator, is_discrete=self.is_discrete) + if self.param_grid_list != None: + self.param_grid_list[index] = make_param_multi_task( + estimator=estimator, param_grid=self.param_grid_list[index]) + + if self.scaling: + if not is_data_scaled(X): + self.scaler = StandardScaler() + scaled_X = self.scaler.fit_transform(X) + + if just_one_model_no_params(estimator_list=self.complete_estimator_list, param_list=self.param_grid_list): + # Just fit the model and return it, no need for Grid search or for loop + estimator = self.complete_estimator_list[0] + if self.random_state != None: + if has_random_state(model=estimator): + # For a polynomial pipeline, you have to set the random state of the linear part, the polynomial part doesn't have random state + if is_polynomial_pipeline(estimator): + estimator = estimator.set_params(linear__random_state=self.random_state) + else: + estimator.set_params(random_state=self.random_state) + if is_polynomial_pipeline(estimator=estimator): + # Only linear part of pipeline can handle sampleweight + estimator.fit(X, y, linear__sample_weight=sample_weight) + elif not supports_sample_weight(estimator=estimator): + estimator.fit(X, y) + else: + estimator.fit(X, y, 
sample_weight=sample_weight) + self.best_ind_ = None + self.best_estimator_ = estimator + self.best_score_ = None + self.best_params_ = {} + return self + for estimator, param_grid in zip(self.complete_estimator_list, self.param_grid_list): + try: + if self.random_state != None: + if has_random_state(model=estimator): + # For a polynomial pipeline, you have to set the random state of the linear part, the polynomial part doesn't have random state + if is_polynomial_pipeline(estimator): + estimator = estimator.set_params(linear__random_state=self.random_state) + else: + estimator.set_params(random_state=self.random_state) + print(estimator) # Note Delete this + print(param_grid) # Note Delete this + # pdb.set_trace() # Note Delete this + temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring, + n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, + pre_dispatch=self.pre_dispatch, error_score=self.error_score, + return_train_score=self.return_train_score) + if self.scaling: + # Add sample weights to the linear layer, not the polynomial featurizer + if is_polynomial_pipeline(estimator=estimator): + temp_search.fit(scaled_X, y, groups=groups, linear__sample_weight=sample_weight) + # MLP does not have sample weight so we cannot fit the search + elif is_mlp(estimator=estimator): + temp_search.fit(scaled_X, y, groups=groups) + else: + temp_search.fit(scaled_X, y, groups=groups, sample_weight=sample_weight) + self._search_list.append(temp_search) + else: + if is_polynomial_pipeline(estimator=estimator): + temp_search.fit(X, y, groups=groups, linear__sample_weight=sample_weight) + elif not supports_sample_weight(estimator=estimator): + temp_search.fit(X, y, groups=groups) + else: + temp_search.fit(X, y, groups=groups, sample_weight=sample_weight) + self._search_list.append(temp_search) + except (ValueError, TypeError, FitFailedWarning) as e: + # This warning catches errors during the fit operation. 
+ warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}" + warnings.warn(warning_msg, category=UserWarning) + if not hasattr(temp_search, 'cv_results_') and not param_grid_is_empty(param_grid=param_grid): + # This warning catches a problem after fit has run with no exception, however if there is no cv_results_ this indicates a failed fit operation. + warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed has no attribute cv_results_." + warnings.warn(warning_msg, category=FitFailedWarning) + + self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) + self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_ + self.best_score_ = self._search_list[self.best_ind_].best_score_ + self.best_params_ = self._search_list[self.best_ind_].best_params_ + print( + f'Best estimator {self.best_estimator_} and best score {self.best_score_} and best params {self.best_params_}') + return self + + def scaler_transform(self, X): + if self.scaling: + return self.scaler.transform(X) + + def best_model(self): + return self.best_estimator_ + + def predict(self, X): + if self.scaling: + return self.best_estimator_.predict(self.scaler.transform(X)) + return self.best_estimator_.predict(X) + + def predict_proba(self, X): + return self.best_estimator_.predict_proba(X) + + def refit(self, X, y): + # Refits the best estimator using the entire dataset. + if self.best_estimator_ is None: + raise ValueError("No best estimator found. Please call the 'fit' method before calling 'refit'.") + + self.best_estimator_.fit(X, y) + return self + + class GridSearchCVList(BaseEstimator): """ An extension of GridSearchCV that allows for passing a list of estimators each with their own parameter grid and returns the best among all estimators in the list and hyperparameter in their @@ -279,14 +494,20 @@ class GridSearchCVList(BaseEstimator): of parameter settings. 
""" - def __init__(self, estimator_list, param_grid_list, scoring=None, + def __init__(self, estimator_list=['linear', 'forest'], param_grid_list='auto', scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=False): - self.estimator_list = estimator_list - self.param_grid_list = param_grid_list + error_score=np.nan, return_train_score=False, is_discrete=False): + # 'discrete' if is_discrete else 'continuous' + self.estimator_list = get_complete_estimator_list(estimator_list, is_discrete, ) + if param_grid_list == 'auto': + self.param_grid_list = auto_hyperparameters(estimator_list=self.estimator_list, is_discrete=is_discrete) + elif (param_grid_list == None): + self.param_grid_list = len(self.estimator_list) * [{}] + else: + self.param_grid_list = param_grid_list self.scoring = scoring self.n_jobs = n_jobs - self.refit = refit + # self.refit = refit self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch @@ -296,7 +517,7 @@ def __init__(self, estimator_list, param_grid_list, scoring=None, def fit(self, X, y=None, **fit_params): self._gcv_list = [GridSearchCV(estimator, param_grid, scoring=self.scoring, - n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, + n_jobs=self.n_jobs, cv=self.cv, verbose=self.verbose, pre_dispatch=self.pre_dispatch, error_score=self.error_score, return_train_score=self.return_train_score) for estimator, param_grid in zip(self.estimator_list, self.param_grid_list)] @@ -306,6 +527,9 @@ def fit(self, X, y=None, **fit_params): self.best_params_ = self._gcv_list[self.best_ind_].best_params_ return self + def best_model(self): + return self.best_estimator_ + def predict(self, X): return self.best_estimator_.predict(X) @@ -313,7 +537,7 @@ def predict_proba(self, X): return self.best_estimator_.predict_proba(X) -def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, +def _cross_val_predict(estimator, X, y=None, *, groups=None, 
cv=3, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict', safe=True): """This is a fork from :meth:`~sklearn.model_selection.cross_val_predict` to allow for diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py new file mode 100644 index 000000000..7aced8728 --- /dev/null +++ b/econml/sklearn_extensions/model_selection_utils.py @@ -0,0 +1,563 @@ + +import pdb +import warnings +from sklearn.exceptions import NotFittedError +import numpy as np +import sklearn +import sklearn.ensemble +import sklearn.linear_model +import sklearn.neural_network +import sklearn.preprocessing +from sklearn.base import BaseEstimator, is_regressor, is_classifier +from sklearn.ensemble import (GradientBoostingClassifier, + GradientBoostingRegressor, + RandomForestClassifier, RandomForestRegressor) +from sklearn.linear_model import (ElasticNetCV, + LogisticRegression, + LogisticRegressionCV, MultiTaskElasticNetCV) +from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, + RandomizedSearchCV, + check_cv) +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import (PolynomialFeatures, + StandardScaler) +from sklearn.svm import SVC, LinearSVC +import inspect +from sklearn.exceptions import NotFittedError +from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier +from sklearn.model_selection import KFold +# from sklearn_extensions.model_selection import WeightedStratifiedKFold + + +def select_continuous_estimator(estimator_type, random_state): + """ + Returns a continuous estimator object for the specified estimator type. + + Parameters + ---------- + estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly'. + TODO Add Random State for parameter + Returns + ---------- + object: An instance of the selected estimator class. 
+ + Raises: + ValueError: If the estimator type is unsupported. + """ + if estimator_type == 'linear': + return (ElasticNetCV(random_state=random_state)) + elif estimator_type == 'forest': + return RandomForestRegressor(random_state=random_state) + elif estimator_type == 'gbf': + return GradientBoostingRegressor(random_state=random_state) + elif estimator_type == 'nnet': + return (MLPRegressor(random_state=random_state)) + elif estimator_type == 'poly': + poly = PolynomialFeatures() + linear = ElasticNetCV(random_state=random_state) # Play around with precompute and tolerance + return (Pipeline([('poly', poly), ('linear', linear)])) + else: + raise ValueError(f"Unsupported estimator type: {estimator_type}") + + +def select_discrete_estimator(estimator_type, random_state): + """ + Returns a discrete estimator object for the specified estimator type. + + Parameters + ---------- + estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly'. + TODO Add Random State for parameter + Returns + ---------- + object: An instance of the selected estimator class. + + Raises: + ValueError: If the estimator type is unsupported. 
+ """ + + if estimator_type == 'linear': + return (LogisticRegressionCV(cv=KFold(random_state=random_state), + multi_class='auto', random_state=random_state)) + elif estimator_type == 'forest': + return RandomForestClassifier(random_state=random_state) + elif estimator_type == 'gbf': + return GradientBoostingClassifier(random_state=random_state) + elif estimator_type == 'nnet': + return (MLPClassifier(random_state=random_state)) + elif estimator_type == 'poly': + poly = PolynomialFeatures() + linear = (LogisticRegressionCV(cv=KFold(random_state=random_state), + multi_class='auto', random_state=random_state)) + return (Pipeline([('poly', poly), ('linear', linear)])) + else: + raise ValueError(f"Unsupported estimator type: {estimator_type}") + + +def select_estimator(estimator_type, is_discrete, random_state): + """ + Returns an estimator object for the specified estimator and target types. + + Parameters + ---------- + estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'automl', 'all'. + is_discrete (bool): The type of target variable, if true then it's discrete. + TODO Add Random State for parameter + Returns + ---------- + object: An instance of the selected estimator class. + + Raises: + ValueError: If the estimator or target types are unsupported. + """ + if not isinstance(is_discrete, bool): + raise ValueError(f"Unsupported target type: {type(is_discrete)}. 
is_discrete should be of type bool.") + elif is_discrete: + return select_discrete_estimator(estimator_type=estimator_type, random_state=random_state) + else: + return select_continuous_estimator(estimator_type=estimator_type, random_state=random_state) + + +def is_likely_estimator(estimator): + required_methods = ['fit', 'predict'] + return all(hasattr(estimator, method) for method in required_methods) or isinstance(estimator, BaseEstimator) + + +def check_list_type(lst): + """ + Checks if a list only contains strings, sklearn model objects, and sklearn model selection objects. + + Parameters + ---------- + lst (list): A list to check. + + Returns + ---------- + bool: True if the list only contains valid objects, False otherwise. + + Raises: + TypeError: If the list contains objects other than strings, sklearn model objects, or sklearn model selection objects. + + Examples: + >>> check_list_type(['linear', RandomForestRegressor(), KFold()]) + True + >>> check_list_type([1, 'linear']) + TypeError: The list must contain only strings, sklearn model objects, and sklearn model selection objects. + """ + if len(lst) == 0: + raise ValueError("Estimator list is empty. Please add some models or use some of the defaults provided.") + + # pdb.set_trace() + for element in lst: + if (not isinstance(element, (str, BaseCrossValidator))): + if not is_likely_estimator(element): + # pdb.set_trace() + raise TypeError( + f"The list must contain only strings, sklearn model objects, and sklearn model selection objects. Invalid element: {element}") + return True + + +def get_complete_estimator_list(estimator_list, is_discrete, random_state): + ''' + Returns a list of sklearn objects from an input list of str's, and sklearn objects. + + Parameters + ---------- + estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'auto', 'all'. + is_discrete (bool): if target type is discrete or continuous. 
+ + Returns + ---------- + object: A list of sklearn objects + + Raises: + ValueError: If the estimator is not supported. + + ''' + # pdb.set_trace() + if isinstance(estimator_list, str): + if 'all' == estimator_list: + estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'] + elif 'auto' == estimator_list: + estimator_list = ['linear'] + elif estimator_list in ['linear', 'forest', 'gbf', 'nnet', 'poly']: + estimator_list = [estimator_list] + else: + raise ValueError( + "Invalid estimator_list value. Please provide a valid value from the list of available estimators: ['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") + elif isinstance(estimator_list, list): + if 'auto' in estimator_list: + for estimator in ['linear']: + if estimator not in estimator_list: + estimator_list.append(estimator) + if 'all' in estimator_list: + for estimator in ['linear', 'forest', 'gbf', 'nnet', 'poly']: + if estimator not in estimator_list: + estimator_list.append(estimator) + + elif is_likely_estimator(estimator_list): + estimator_list = [estimator_list] + else: + raise ValueError(f"Incorrect type: {type(estimator_list)}") + check_list_type(estimator_list) + temp_est_list = [] + + if not isinstance(estimator_list, list): + raise ValueError(f"estimator_list should be of type list not: {type(estimator_list)}") + + # Set to remove duplicates + for estimator in set(estimator_list): + # if sklearn object: add to list, else turn str into corresponding sklearn object and add to list + if isinstance(estimator, BaseCrossValidator) or is_likely_estimator(estimator): + temp_est_list.append(estimator) + else: + temp_est_list.append(select_estimator(estimator_type=estimator, + is_discrete=is_discrete, random_state=random_state)) + temp_est_list = flatten_list(temp_est_list) + + # Check that all types of models are matched towards the problem. 
+ # pdb.set_trace() + for estimator in temp_est_list: + if (isinstance(estimator, BaseEstimator)): + if not is_regressor_or_classifier(estimator, is_discrete=is_discrete): + raise TypeError("Invalid estimator type: {} - must be a regressor or classifier".format(type(estimator))) + return temp_est_list + + +def select_classification_hyperparameters(estimator): + """ + Returns a hyperparameter grid for the specified classification model type. + + Parameters + ---------- + model_type (str): The type of model to be used. Valid values are 'linear', 'forest', 'nnet', and 'poly'. + + Returns + ---------- + A dictionary representing the hyperparameter grid to search over. + """ + + if isinstance(estimator, LogisticRegressionCV): + return { + 'Cs': [0.01, 0.1, 1], + 'cv': [3], + 'penalty': ['l1', 'l2', 'elasticnet'], + 'solver': ['lbfgs', 'liblinear', 'saga'] + } + elif isinstance(estimator, RandomForestClassifier): + return { + 'n_estimators': [100, 500], + 'max_depth': [None, 5, 10, 20], + 'min_samples_split': [2, 5], + 'min_samples_leaf': [1, 2] + } + elif isinstance(estimator, GradientBoostingClassifier): + return { + 'n_estimators': [100, 500], + 'learning_rate': [0.01, 0.05, 0.1], + 'max_depth': [3, 5, 7], + + } + elif isinstance(estimator, MLPClassifier): + return { + 'hidden_layer_sizes': [(10,), (50,), (100,)], + 'activation': ['relu'], + 'solver': ['adam'], + 'alpha': [0.0001, 0.001, 0.01], + 'learning_rate': ['constant', 'adaptive'] + } + elif is_polynomial_pipeline(estimator=estimator): + return { + 'poly__degree': [2, 3, 4], + 'linear__Cs': [1, 10, 20], + 'linear__max_iter': [100, 200], + 'linear__penalty': ['l2'], + 'linear__solver': ['saga', 'liblinear', 'lbfgs'] + } + else: + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", category=UserWarning) + return {} + # raise ValueError("Invalid model type. 
Valid values are 'linear', 'forest', 'nnet', and 'poly'.") + + +def select_regression_hyperparameters(estimator): + """ + Returns a dictionary of hyperparameters to be searched over for a regression model. + + Parameters + ---------- + model_type (str): The type of model to be used. Valid values are 'linear', 'forest', 'nnet', and 'poly'. + + Returns + ---------- + A dictionary of hyperparameters to be searched over using a grid search. + """ + if isinstance(estimator, ElasticNetCV): + return { + 'l1_ratio': [0.1, 0.5, 0.9], + 'cv': [3], + 'max_iter': [1000], + } + elif isinstance(estimator, RandomForestRegressor): + return { + 'n_estimators': [100], + 'max_depth': [None, 10, 50], + 'min_samples_split': [2, 5, 10], + } + elif isinstance(estimator, MLPRegressor): + return { + 'hidden_layer_sizes': [(10,), (50,), (100,)], + 'alpha': [0.0001, 0.001, 0.01], + 'learning_rate': ['constant', 'adaptive'] + } + elif isinstance(estimator, GradientBoostingRegressor): + return { + 'n_estimators': [100, 500], + 'learning_rate': [0.01, 0.1, 0.05], + 'max_depth': [3, 5], + } + elif is_polynomial_pipeline(estimator=estimator): + return { + 'linear__l1_ratio': [0.1, 0.5, 0.9], + 'linear__max_iter': [1000], + 'poly__degree': [2, 3, 4] + } + else: + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", category=UserWarning) + return {} + + +def flatten_list(lst): + """ + Flatten a list that may contain nested lists. + + Parameters + ---------- + lst (list): The list to flatten. + + Returns + ---------- + list: The flattened list. + """ + flattened = [] + for item in lst: + if isinstance(item, list): + flattened.extend(flatten_list(item)) + else: + flattened.append(item) + return flattened + + +def auto_hyperparameters(estimator_list, is_discrete=True): + """ + Selects hyperparameters for a list of estimators. 
+ + Parameters + ---------- + - estimator_list: list of scikit-learn estimators + - is_discrete: boolean indicating whether the problem is classification or regression + + Returns + ---------- + - param_list: list of parameter grids for the estimators + """ + param_list = [] + for estimator in estimator_list: + if is_discrete: + param_list.append(select_classification_hyperparameters(estimator=estimator)) + else: + param_list.append(select_regression_hyperparameters(estimator=estimator)) + return param_list + + +def set_search_hyperparameters(search_object, hyperparameters): + if isinstance(search_object, (RandomizedSearchCV, GridSearchCV)): + search_object.set_params(**hyperparameters) + else: + raise ValueError("Invalid search object") + + +def is_mlp(estimator): + return isinstance(estimator, (MLPClassifier, MLPRegressor)) + + +def has_random_state(model): + if is_polynomial_pipeline(model): + signature = inspect.signature(type(model['linear'])) + else: + signature = inspect.signature(type(model)) + return ("random_state" in signature.parameters) + + +def supports_sample_weight(estimator): + fit_signature = inspect.signature(estimator.fit) + return 'sample_weight' in fit_signature.parameters + + +def just_one_model_no_params(estimator_list, param_list): + return (len(estimator_list) == 1) and (len(param_list) == 1) and (len(param_list[0]) == 0) + + +def param_grid_is_empty(param_grid): + return len(param_grid) == 0 + + +def is_linear_model(estimator): + """ + Check whether an estimator is a polynomial regression, logistic regression, linear SVM, or any other type of + linear model. + + Parameters + ---------- + estimator (scikit-learn estimator): The estimator to check. + + Returns + ---------- + is_linear (bool): True if the estimator is a linear model, False otherwise. 
+ """ + + if isinstance(estimator, Pipeline): + has_poly_feature_step = any(isinstance(step[1], PolynomialFeatures) for step in estimator.steps) + if has_poly_feature_step: + return True + + if hasattr(estimator, 'fit_intercept') and hasattr(estimator, 'coef_'): + return True + + if isinstance(estimator, (LogisticRegression, LinearSVC, SVC)): + return True + + return False + + +def is_data_scaled(X): + """ + Check if the input data is already centered and scaled using StandardScaler. + + Parameters + ---------- + X array-like of shape (n_samples, n_features): The input data. + + Returns + ---------- + is_scaled (bool): Whether the input data is already centered and scaled using StandardScaler or not. + + """ + mean = np.mean(X, axis=0) + std = np.std(X, axis=0) + + is_scaled = np.allclose(mean, 0.0) and np.allclose(std, 1.0) + + return is_scaled + + +def is_regressor_or_classifier(model, is_discrete): + if is_discrete: + if is_polynomial_pipeline(model): + return is_classifier(model[1]) + else: + return is_classifier(model) + else: + if is_polynomial_pipeline(model): + return is_regressor(model[1]) + else: + return is_regressor(model) + + +def scale_pipeline(model): + """ + Returns a pipeline that scales the input data using StandardScaler and applies the given model. + + Parameters + ---------- + model : estimator object + A model object that implements the scikit-learn estimator interface. + + Returns + ---------- + pipe : Pipeline object + A pipeline that scales the input data using StandardScaler and applies the given model. 
+ """ + pipe = Pipeline([('scaler', StandardScaler()), ('model', model)]) + return pipe + + +def is_polynomial_pipeline(estimator): + if not isinstance(estimator, Pipeline): + return False + steps = estimator.steps + if len(steps) != 2: + return False + poly_step = steps[0] + if not isinstance(poly_step[1], PolynomialFeatures): + return False + return True + + +def is_likely_multi_task(y): + if len(y.shape) == 2: + if y.shape[1] > 1: + return True + return False + + +def can_handle_multitask(model, is_discrete=False): + X = np.random.rand(10, 3) + if is_discrete: + y = np.random.randint(0, 2, (10, 2)) + else: + y = np.random.rand(10, 2) + + try: + model.fit(X, y) + except Exception as e: + return False + + try: + model.predict(X) + except Exception as e: + # warnings.warn(f"The model {model.__class__.__name__} is not properly fitted. Error: {e}") + return False + return True + + +def pipeline_convert_to_multitask(pipeline): + steps = list(pipeline.steps) + + if isinstance(steps[-1][1], (LogisticRegressionCV)): + steps[-1] = ('linear', MultiOutputClassifier(steps[-1][1])) + if isinstance(steps[-1][1], (ElasticNetCV)): + steps[-1] = ('linear', MultiTaskElasticNetCV()) + new_pipeline = Pipeline(steps) + + return new_pipeline + + +def make_model_multi_task(model, is_discrete): + try: + if is_discrete: + if is_polynomial_pipeline(model): + return pipeline_convert_to_multitask(model) + return MultiOutputClassifier(model) + else: + if isinstance(model, ElasticNetCV): + return MultiTaskElasticNetCV() + elif is_polynomial_pipeline(model): + return pipeline_convert_to_multitask(model) + else: + return MultiOutputRegressor(model) + except TypeError as e: + raise ValueError(f"An error occurred due to type mismatch: {e}") from e + except AttributeError as e: + raise ValueError(f"An error occurred due to attribute error: {e}") from e + except Exception as e: + raise ValueError("An unknown error occurred when making model multitask.") from e + + +def 
make_param_multi_task(estimator, param_grid): + if isinstance(estimator, ElasticNetCV): + return param_grid + else: + param_grid_multi = {f'estimator__{k}': v for k, v in param_grid.items()} + return param_grid_multi diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 195b4615d..afb445ccd 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -22,7 +22,9 @@ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.multioutput import MultiOutputRegressor from econml.grf import MultiOutputGRF +from econml.sklearn_extensions.model_selection import SearchEstimatorList from econml.tests.utilities import (GroupingModel, NestedModel) +import pdb try: import ray @@ -623,9 +625,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, Pipeline) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, WeightedLasso) + assert isinstance(mdl, SearchEstimatorList) for mdl in est.models_t[0]: - assert isinstance(mdl, LogisticRegression) + assert isinstance(mdl, SearchEstimatorList) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A', 'A^2']) np.testing.assert_array_equal(est.cate_feature_names(), ['X0', 'X0^2']) est = DML(model_y=WeightedLasso(), @@ -639,9 +641,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, FunctionTransformer) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, WeightedLasso) + assert isinstance(mdl, SearchEstimatorList) for mdl in est.models_t[0]: - assert isinstance(mdl, LogisticRegression) + assert isinstance(mdl, SearchEstimatorList) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A']) def test_forest_dml_perf(self): @@ -1129,7 +1131,7 @@ def _test_sparse(n_p, d_w, n_r): model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) dml.fit(y, t, X=x, W=w) - + # pdb.set_trace() 
np.testing.assert_allclose(a, dml.coef_.reshape(-1), atol=1e-1) eff = reshape(t * np.choose(np.tile(p, 2), a), (-1,)) np.testing.assert_allclose(eff, dml.effect(x, T0=0, T1=t), atol=1e-1) diff --git a/econml/utilities.py b/econml/utilities.py index 84c577a93..aa6145d96 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -30,6 +30,7 @@ from statsmodels.compat.python import lmap import copy from inspect import signature +from econml.sklearn_extensions.model_selection import SearchEstimatorList MAX_RAND_SEED = np.iinfo(np.int32).max @@ -950,8 +951,28 @@ def fit_with_groups(model, X, y, groups=None, **kwargs): kwargs : dict Any other named arguments to pass to the model's fit """ - + # import pdb + # pdb.set_trace() if groups is not None: + if isinstance(model, SearchEstimatorList): + for estimator in model.complete_estimator_list: + if hasattr(estimator, 'cv'): + old_cv = estimator.cv + # logic copied from check_cv + cv = 5 if old_cv is None else old_cv + if isinstance(cv, numbers.Integral): + cv = GroupKFold(cv) + # otherwise we will assume the user already set the cv attribute to something + # compatible with splitting with a 'groups' argument + + # now we have to compute the folds explicitly because some classifiers (like LassoCV) + # don't use the groups when calling split internally + splits = list(cv.split(X, y, groups=groups)) + try: + estimator.cv = splits + return estimator.fit(X, y, **kwargs) + finally: + estimator.cv = old_cv # assume that we should perform nested cross-validation if and only if # the model has a 'cv' attribute; this is a somewhat brittle assumption... 
if hasattr(model, 'cv'): @@ -967,6 +988,7 @@ def fit_with_groups(model, X, y, groups=None, **kwargs): # don't use the groups when calling split internally splits = list(cv.split(X, y, groups=groups)) try: + print(splits) model.cv = splits return model.fit(X, y, **kwargs) finally: From 30c290a6e52c7cc1e758ac9cb219452754150c9e Mon Sep 17 00:00:00 2001 From: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:15:48 -0400 Subject: [PATCH 02/19] Fixed fitting with groups, fixed one param grid case, other bugs Signed-off-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> --- econml/dml/dml.py | 18 +- econml/new_tests/test_model_selection.py | 8 +- econml/sklearn_extensions/model_selection.py | 11 +- .../model_selection_utils.py | 242 +++++++++++++++++- econml/utilities.py | 13 +- 5 files changed, 253 insertions(+), 39 deletions(-) diff --git a/econml/dml/dml.py b/econml/dml/dml.py index 7ff5ad354..515295444 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -541,7 +541,10 @@ def _gen_model_y(self): # New else: model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # model_y = clone(self.model_y, safe=False) + # if self.model_y == 'auto': + # model_y = WeightedLassoCVWrapper(random_state=self.random_state) + # else: + # model_y = clone(self.model_y, safe=False) return _FirstStageWrapper(model_y, True, self._gen_featurizer(), self.linear_first_stages, self.discrete_treatment) @@ -557,14 +560,19 @@ def _gen_model_t(self): # New model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, n_jobs=self.n_jobs, random_state=self.random_state) - # model_t = 
LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - # model_t = WeightedLassoCVWrapper(random_state=self.random_state) + else: model_t = clone(SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # model_t = clone(self.model_t, safe=False) - + # if self.model_t == 'auto': + # if self.discrete_treatment: + # model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), + # random_state=self.random_state) + # else: + # model_t = WeightedLassoCVWrapper(random_state=self.random_state) + # else: + # model_t = clone(self.model_t, safe=False) return _FirstStageWrapper(model_t, False, self._gen_featurizer(), self.linear_first_stages, self.discrete_treatment) diff --git a/econml/new_tests/test_model_selection.py b/econml/new_tests/test_model_selection.py index b007ddd21..1eb82db0b 100644 --- a/econml/new_tests/test_model_selection.py +++ b/econml/new_tests/test_model_selection.py @@ -256,17 +256,11 @@ def test_custom_random_state(self): self.assertGreaterEqual(acc, self.expected_accuracy) self.assertGreaterEqual(f1, self.expected_f1_score) - def test_invalid_custom_scoring_function(self): - with self.assertRaises(ValueError): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, scoring='invalid_scorer') - + def test_invalid_incorrect_scoring_numbers(self): with self.assertRaises(ValueError): search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False, scoring=123) - - if __name__ == '__main__': diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index e9c82ddc2..0e667cee2 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -332,7 
+332,10 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc elif (param_grid_list == None): self.param_grid_list = len(self.complete_estimator_list) * [{}] else: - self.param_grid_list = param_grid_list + if isinstance(param_grid_list, dict): + self.param_grid_list = [param_grid_list] + else: + self.param_grid_list = param_grid_list self.categorical_indices = categorical_indices self.scoring = scoring if scoring == None: @@ -356,9 +359,9 @@ def fit(self, X, y, *, sample_weight=None, groups=None): # print(groups) # if groups != None: # pdb.set_trace() - - self._search_list = [] # pdb.set_trace() + self._search_list = [] + # Change estimators if multi_task if is_likely_multi_task(y): for index, estimator in enumerate(self.complete_estimator_list): @@ -375,7 +378,7 @@ def fit(self, X, y, *, sample_weight=None, groups=None): scaled_X = self.scaler.fit_transform(X) if just_one_model_no_params(estimator_list=self.complete_estimator_list, param_list=self.param_grid_list): - # Just fit the model and return it, no need for Grid search or for loop + # Just fit the model and return it, no need for grid search or for loop estimator = self.complete_estimator_list[0] if self.random_state != None: if has_random_state(model=estimator): diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py index 7aced8728..0ab0f87c1 100644 --- a/econml/sklearn_extensions/model_selection_utils.py +++ b/econml/sklearn_extensions/model_selection_utils.py @@ -120,6 +120,22 @@ def select_estimator(estimator_type, is_discrete, random_state): def is_likely_estimator(estimator): + """ + Check if an object is likely to be an estimator. + + This function checks if an object has 'fit' and 'predict' methods, or if it is an instance of BaseEstimator. + + Parameters + ---------- + estimator : object + The object to check. + + Returns + ------- + bool + True if the object is likely to be an estimator, False otherwise. 
+ """ + required_methods = ['fit', 'predict'] return all(hasattr(estimator, method) for method in required_methods) or isinstance(estimator, BaseEstimator) @@ -383,6 +399,22 @@ def is_mlp(estimator): def has_random_state(model): + """ + Check if a model has a 'random_state' parameter. + + This function inspects the model's signature to check if it has a 'random_state' parameter. + + Parameters + ---------- + model : object + The model to check. + + Returns + ------- + bool + True if the model has a 'random_state' parameter, False otherwise. + """ + if is_polynomial_pipeline(model): signature = inspect.signature(type(model['linear'])) else: @@ -391,30 +423,84 @@ def has_random_state(model): def supports_sample_weight(estimator): + """ + Check if a model supports 'sample_weight'. + + This function inspects the signature of the model's 'fit' method to check if it supports 'sample_weight'. + + Parameters + ---------- + estimator : object + The model to check. + + Returns + ------- + bool + True if the model supports 'sample_weight', False otherwise. + """ + fit_signature = inspect.signature(estimator.fit) return 'sample_weight' in fit_signature.parameters def just_one_model_no_params(estimator_list, param_list): + """ + Check if there is only one model and the parameter list is empty. + + This function checks that the estimator list and the parameter list each contain exactly one entry and that the single parameter grid is empty. + + Parameters + ---------- + estimator_list : list + List of models. + + param_list : list + List of parameters. + + Returns + ------- + bool + True if there is only one model and the parameter list is empty, False otherwise. + """ + return (len(estimator_list) == 1) and (len(param_list) == 1) and (len(param_list[0]) == 0) def param_grid_is_empty(param_grid): + """ + Check if a parameter grid is empty. + + This function checks if the length of the parameter grid is 0. 
+ + Parameters + ---------- + param_grid : dict + Parameter grid to check.
+ + Returns + ------- + bool + True if the parameter grid is empty, False otherwise. + """ + return len(param_grid) == 0 def is_linear_model(estimator): """ - Check whether an estimator is a polynomial regression, logistic regression, linear SVM, or any other type of - linear model. + Check if a model is a linear model. + + This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of LogisticRegression, LinearSVC, or SVC. Parameters ---------- - estimator (scikit-learn estimator): The estimator to check. + model : object + The model to check. Returns - ---------- - is_linear (bool): True if the estimator is a linear model, False otherwise. + ------- + bool + True if the model is a linear model, False otherwise. """ if isinstance(estimator, Pipeline): @@ -433,15 +519,19 @@ def is_linear_model(estimator): def is_data_scaled(X): """ - Check if the input data is already centered and scaled using StandardScaler. + Check if input data is scaled. + + This function checks if the input data is scaled by comparing its mean and standard deviation to 0 and 1 respectively. Parameters ---------- - X array-like of shape (n_samples, n_features): The input data. + X : array-like of shape (n_samples, n_features) + Input data. Returns - ---------- - is_scaled (bool): Whether the input data is already centered and scaled using StandardScaler or not. + ------- + bool + True if the input data is scaled, False otherwise. """ mean = np.mean(X, axis=0) @@ -453,6 +543,25 @@ def is_data_scaled(X): def is_regressor_or_classifier(model, is_discrete): + """ + Check if a model is a regressor or classifier. + + This function checks if a model is a regressor or classifier depending on the 'is_discrete' parameter. + + Parameters + ---------- + model : object + The model to check. + + is_discrete : bool + If True, checks if the model is a classifier. If False, checks if the model is a regressor. 
+ + Returns + ------- + bool + True if the model matches the type specified by 'is_discrete', False otherwise. + """ + if is_discrete: if is_polynomial_pipeline(model): return is_classifier(model[1]) @@ -484,6 +593,22 @@ def scale_pipeline(model): def is_polynomial_pipeline(estimator): + """ + Check if a model is a polynomial pipeline. + + This function checks if a model is a pipeline that includes a PolynomialFeatures step. + + Parameters + ---------- + model : object + The model to check. + + Returns + ------- + bool + True if the model is a polynomial pipeline, False otherwise. + """ + if not isinstance(estimator, Pipeline): return False steps = estimator.steps @@ -496,6 +621,22 @@ def is_polynomial_pipeline(estimator): def is_likely_multi_task(y): + """ + Check if a target array is likely multi-task. + + This function checks if a target array is likely to be multi-task by checking its shape. + + Parameters + ---------- + y : array-like + The target array to check. + + Returns + ------- + bool + True if the target array is likely multi-task, False otherwise. + """ + if len(y.shape) == 2: if y.shape[1] > 1: return True @@ -503,6 +644,22 @@ def is_likely_multi_task(y): def can_handle_multitask(model, is_discrete=False): + """ + Check if a model can handle multi-task output. + + This function checks if a model can handle multi-task output by trying to fit and predict on random data. + + Parameters + ---------- + model : object + The model to check. + + Returns + ------- + bool + True if the model can handle multi-task output, False otherwise. + """ + X = np.random.rand(10, 3) if is_discrete: y = np.random.randint(0, 2, (10, 2)) @@ -523,8 +680,31 @@ def can_handle_multitask(model, is_discrete=False): def pipeline_convert_to_multitask(pipeline): - steps = list(pipeline.steps) + """ + Convert a pipeline to handle multi-task output if possible. + + This function iterates over the steps in the input pipeline. 
If a step is a + polynomial transformer, it adds the step to the new pipeline as is. If the + step is an estimator, it attempts to convert it to handle multi-task output + and adds the converted estimator to the new pipeline. + + Parameters + ---------- + pipeline : sklearn.Pipeline + The pipeline to convert. + + Returns + ------- + sklearn.Pipeline + The converted pipeline. + + Raises + ------ + ValueError + If an unknown error occurs when making model multi-task. + """ + steps = list(pipeline.steps) if isinstance(steps[-1][1], (LogisticRegressionCV)): steps[-1] = ('linear', MultiOutputClassifier(steps[-1][1])) if isinstance(steps[-1][1], (ElasticNetCV)): @@ -535,6 +715,25 @@ def pipeline_convert_to_multitask(pipeline): def make_model_multi_task(model, is_discrete): + """ + Convert a model to handle multi-task output if possible. + + This function converts a model to handle multi-task output if possible. + + Parameters + ---------- + model : object + The model to convert. + + is_discrete : bool + If True, the model is treated as a classifier. If False, the model is treated as a regressor. + + Returns + ------- + object + The converted model if possible, raises an error otherwise. + """ + try: if is_discrete: if is_polynomial_pipeline(model): @@ -547,15 +746,30 @@ def make_model_multi_task(model, is_discrete): return pipeline_convert_to_multitask(model) else: return MultiOutputRegressor(model) - except TypeError as e: - raise ValueError(f"An error occurred due to type mismatch: {e}") from e - except AttributeError as e: - raise ValueError(f"An error occurred due to attribute error: {e}") from e except Exception as e: raise ValueError("An unknown error occurred when making model multitask.") from e def make_param_multi_task(estimator, param_grid): + """ + Convert the keys in a parameter grid to work with a multi-task model. + + This function converts the keys in a parameter grid to work with a multi-task model by prepending 'estimator__' to each key. 
+ + Parameters + ---------- + estimator : object + The estimator the parameter grid is for. + + param_grid : dict + The parameter grid to convert. + + Returns + ------- + dict + The converted parameter grid. + """ + if isinstance(estimator, ElasticNetCV): return param_grid else: diff --git a/econml/utilities.py b/econml/utilities.py index aa6145d96..008bfc244 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -955,27 +955,22 @@ def fit_with_groups(model, X, y, groups=None, **kwargs): # pdb.set_trace() if groups is not None: if isinstance(model, SearchEstimatorList): + # SearchEstimatorList must be handled different. Each estimator must be changed for CV else the functionality isn't the same + # It does have a CV but it does not work if you just change the CV of the SearchEstimatorList for estimator in model.complete_estimator_list: if hasattr(estimator, 'cv'): old_cv = estimator.cv - # logic copied from check_cv cv = 5 if old_cv is None else old_cv if isinstance(cv, numbers.Integral): cv = GroupKFold(cv) - # otherwise we will assume the user already set the cv attribute to something - # compatible with splitting with a 'groups' argument - - # now we have to compute the folds explicitly because some classifiers (like LassoCV) - # don't use the groups when calling split internally splits = list(cv.split(X, y, groups=groups)) try: estimator.cv = splits - return estimator.fit(X, y, **kwargs) - finally: + except: estimator.cv = old_cv # assume that we should perform nested cross-validation if and only if # the model has a 'cv' attribute; this is a somewhat brittle assumption... 
- if hasattr(model, 'cv'): + elif hasattr(model, 'cv'): old_cv = model.cv # logic copied from check_cv cv = 5 if old_cv is None else old_cv From 55c585809d3103570e817cd367d7733f4baefc0d Mon Sep 17 00:00:00 2001 From: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Date: Thu, 10 Aug 2023 17:06:29 -0400 Subject: [PATCH 03/19] Final commit, added encoding for categorical data (untested) and added notebook to showcase some of the functionality Signed-off-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> --- econml/dml/dml.py | 14 +- econml/sklearn_extensions/model_selection.py | 23 +- .../model_selection_utils.py | 47 +- .../SearchEstimatorList functionality.ipynb | 1031 +++++++++++++++++ 4 files changed, 1090 insertions(+), 25 deletions(-) create mode 100644 notebooks/SearchEstimatorList functionality.ipynb diff --git a/econml/dml/dml.py b/econml/dml/dml.py index 515295444..d7c59013b 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -484,6 +484,8 @@ def __init__(self, *, model_y, model_t, model_final, param_list_y=None, param_list_t=None, + scoring_y=None, + scoring_t=None, scaling=False, featurizer=None, treatment_featurizer=None, @@ -509,6 +511,8 @@ def __init__(self, *, self.scaling = scaling self.param_list_y = param_list_y self.param_list_t = param_list_t + self.scoring_y = scoring_y + self.scoring_t = scoring_t self.verbose = verbose self.cv = cv self.grid_folds = grid_folds @@ -536,10 +540,10 @@ def _gen_featurizer(self): def _gen_model_y(self): # New if self.model_y == 'auto': - model_y = SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, + model_y = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_y, scoring=self.scoring_y, scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state) else: - model_y = 
clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, + model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scoring=self.scoring_y, scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) # if self.model_y == 'auto': # model_y = WeightedLassoCVWrapper(random_state=self.random_state) @@ -549,15 +553,13 @@ def _gen_model_y(self): # New self.linear_first_stages, self.discrete_treatment) def _gen_model_t(self): # New - # import pdb - # pdb.set_trace() if self.model_t == 'auto': if self.discrete_treatment: - model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, + model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scoring=self.scoring_t, scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment, n_jobs=self.n_jobs, random_state=self.random_state) else: - model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, + model_t = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_t, scoring=self.scoring_t, scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, n_jobs=self.n_jobs, random_state=self.random_state) diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index 0e667cee2..d8c55538d 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -354,6 +354,7 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc self.error_score = error_score self.return_train_score = return_train_score self.is_discrete = is_discrete + self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly'] def 
fit(self, X, y, *, sample_weight=None, groups=None): # print(groups) @@ -400,6 +401,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None): self.best_params_ = {} return self for estimator, param_grid in zip(self.complete_estimator_list, self.param_grid_list): + if self.verbose: + if is_polynomial_pipeline(estimator): + print(f"Processing estimator: {type(estimator.named_steps['linear']).__name__}") + else: + print(f"Processing estimator: {type(estimator).__name__}") try: if self.random_state != None: if has_random_state(model=estimator): @@ -408,8 +414,6 @@ def fit(self, X, y, *, sample_weight=None, groups=None): estimator = estimator.set_params(linear__random_state=self.random_state) else: estimator.set_params(random_state=self.random_state) - print(estimator) # Note Delete this - print(param_grid) # Note Delete this # pdb.set_trace() # Note Delete this temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring, n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, @@ -441,8 +445,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None): # This warning catches a problem after fit has run with no exception, however if there is no cv_results_ this indicates a failed fit operation. warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed has no attribute cv_results_." warnings.warn(warning_msg, category=FitFailedWarning) - - self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) + try: + self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) + except Exception as e: + warning_msg = f"Failed for estimator {estimator} and param_grid {param_grid} with this error {e}." 
+ raise Exception(warning_msg) from e self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_ self.best_score_ = self._search_list[self.best_ind_].best_score_ self.best_params_ = self._search_list[self.best_ind_].best_params_ @@ -465,14 +472,6 @@ def predict(self, X): def predict_proba(self, X): return self.best_estimator_.predict_proba(X) - def refit(self, X, y): - # Refits the best estimator using the entire dataset. - if self.best_estimator_ is None: - raise ValueError("No best estimator found. Please call the 'fit' method before calling 'refit'.") - - self.best_estimator_.fit(X, y) - return self - class GridSearchCVList(BaseEstimator): """ An extension of GridSearchCV that allows for passing a list of estimators each with their own diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py index 0ab0f87c1..477731600 100644 --- a/econml/sklearn_extensions/model_selection_utils.py +++ b/econml/sklearn_extensions/model_selection_utils.py @@ -27,7 +27,7 @@ from sklearn.exceptions import NotFittedError from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier from sklearn.model_selection import KFold -# from sklearn_extensions.model_selection import WeightedStratifiedKFold +import pandas as pd def select_continuous_estimator(estimator_type, random_state): @@ -57,6 +57,9 @@ def select_continuous_estimator(estimator_type, random_state): poly = PolynomialFeatures() linear = ElasticNetCV(random_state=random_state) # Play around with precompute and tolerance return (Pipeline([('poly', poly), ('linear', linear)])) + elif estimator_type == 'weighted_lasso': + from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper + return WeightedLassoCVWrapper(random_state=random_state) else: raise ValueError(f"Unsupported estimator type: {estimator_type}") @@ -278,18 +281,15 @@ def select_classification_hyperparameters(estimator): elif isinstance(estimator, MLPClassifier): 
return { 'hidden_layer_sizes': [(10,), (50,), (100,)], - 'activation': ['relu'], - 'solver': ['adam'], - 'alpha': [0.0001, 0.001, 0.01], + 'alpha': [0.0001, 0.01], 'learning_rate': ['constant', 'adaptive'] } elif is_polynomial_pipeline(estimator=estimator): return { 'poly__degree': [2, 3, 4], - 'linear__Cs': [1, 10, 20], 'linear__max_iter': [100, 200], 'linear__penalty': ['l2'], - 'linear__solver': ['saga', 'liblinear', 'lbfgs'] + 'linear__solver': ['saga', 'lbfgs'] } else: warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", category=UserWarning) @@ -324,7 +324,7 @@ def select_regression_hyperparameters(estimator): elif isinstance(estimator, MLPRegressor): return { 'hidden_layer_sizes': [(10,), (50,), (100,)], - 'alpha': [0.0001, 0.001, 0.01], + 'alpha': [0.0001, 0.01], 'learning_rate': ['constant', 'adaptive'] } elif isinstance(estimator, GradientBoostingRegressor): @@ -775,3 +775,36 @@ def make_param_multi_task(estimator, param_grid): else: param_grid_multi = {f'estimator__{k}': v for k, v in param_grid.items()} return param_grid_multi + + +def preprocess_and_encode(data, cat_indices=None): + """ + Detects categorical columns, one-hot encodes them, and returns the preprocessed data. 
+ + Parameters: + - data: pandas DataFrame or numpy array + - cat_indices: list of column indices (or names for DataFrame) to be considered categorical + + Returns: + - Preprocessed data in the format of the original input (DataFrame or numpy array) + """ + was_numpy = False + if isinstance(data, np.ndarray): + was_numpy = True + data = pd.DataFrame(data) + + # If cat_indices is None, detect categorical columns using object type as a heuristic + if cat_indices is None: + cat_columns = data.select_dtypes(['object']).columns.tolist() + else: + if all(isinstance(i, int) for i in cat_indices): # if cat_indices are integer indices + cat_columns = data.columns[cat_indices].tolist() + else: # assume cat_indices are column names + cat_columns = cat_indices + + data_encoded = pd.get_dummies(data, columns=cat_columns) + + if was_numpy: + return data_encoded.values + else: + return data_encoded diff --git a/notebooks/SearchEstimatorList functionality.ipynb b/notebooks/SearchEstimatorList functionality.ipynb new file mode 100644 index 000000000..4464199de --- /dev/null +++ b/notebooks/SearchEstimatorList functionality.ipynb @@ -0,0 +1,1031 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary packages\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_squared_error, accuracy_score\n", + "from sklearn.datasets import load_iris\n", + "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", + "import warnings\n", + "import numpy as np\n", + "from econml.dml import LinearDML, CausalForestDML\n", + "from econml.cate_interpreter import SingleTreeCateInterpreter, SingleTreePolicyInterpreter\n", + "import pandas as pd\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.exceptions import ConvergenceWarning\n", + "\n", + "# Ignore the ConvergenceWarning\n", + 
"warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SearchEstimatorList\n", + "\n", + "The SearchEstimatorList class is a custom Python class designed to streamline the process of training multiple machine learning models and tuning their hyperparameters. This class can be especially useful when you're unsure which model will perform best on your data and you want to compare several of them.\n", + "\n", + "# Key Features\n", + "\n", + " Multiple Model Training: The SearchEstimatorList class takes a list of Scikit-learn estimators (machine learning models) and trains each of them on your data.\n", + "\n", + " Hyperparameter Tuning: For each model, the class conducts a grid search over a provided range of hyperparameters. This allows you to automatically find the hyperparameters that result in the best model performance.\n", + "\n", + " Model Evaluation: The class retains the best performing model based on a specified scoring metric. This makes it easy to determine which model and hyperparameters are the most suitable for your data.\n", + "\n", + " Data Scaling: The SearchEstimatorList class also supports data scaling, which can be important for certain types of models.\n", + "\n", + " Handling of Different Target Types: This class handles both continuous and discrete target variables, making it suitable for both regression and classification tasks.\n", + "\n", + "# Usage\n", + "\n", + "To use the SearchEstimatorList class, you start by initializing an instance of the class with a list of models and their corresponding hyperparameter grids. Then, you call the fit method to train the models and conduct the grid search. After fitting, you can use the predict method to generate predictions for new data. 
The best_model method returns the best model; refitting it on the entire dataset is governed by the refit setting passed to the underlying grid search rather than by a separate refit method."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No scoring value was given. Using default score method f1_macro.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", + "[CV 1/2] END ...................n_estimators=50;, score=0.916 total time= 0.1s\n", + "[CV 2/2] END ...................n_estimators=50;, score=0.950 total time= 0.1s\n", + "[CV 1/2] END ..................n_estimators=100;, score=0.916 total time= 0.1s\n", + "[CV 2/2] END ..................n_estimators=100;, score=0.950 total time= 0.1s\n", + "[CV 1/2] END ..................n_estimators=150;, score=0.916 total time= 0.1s\n", + "[CV 2/2] END ..................n_estimators=150;, score=0.950 total time= 0.1s\n", + "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", + "[CV 1/2] END learning_rate=0.01, n_estimators=50;, score=0.900 total time= 0.0s\n", + "[CV 2/2] END learning_rate=0.01, n_estimators=50;, score=0.950 total time= 0.0s\n", + "[CV 1/2] END learning_rate=0.01, n_estimators=100;, score=0.900 total time= 0.0s\n", + "[CV 2/2] END learning_rate=0.01, n_estimators=100;, score=0.950 total time= 0.1s\n", + "[CV 1/2] END learning_rate=0.01, n_estimators=150;, score=0.900 total time= 0.1s\n", + "[CV 2/2] END learning_rate=0.01, n_estimators=150;, score=0.950 total time= 0.1s\n", + "[CV 1/2] END learning_rate=0.1, n_estimators=50;, score=0.900 total time= 0.0s\n", + "[CV 2/2] END learning_rate=0.1, n_estimators=50;, score=0.950 total time= 0.0s\n", + "[CV 1/2] END learning_rate=0.1, n_estimators=100;, score=0.900 total time= 0.1s\n", + "[CV 2/2] END learning_rate=0.1, n_estimators=100;, score=0.933 total time= 0.1s\n", + "[CV 1/2] END learning_rate=0.1, n_estimators=150;, score=0.900 total time= 0.1s\n", + 
"[CV 2/2] END learning_rate=0.1, n_estimators=150;, score=0.933 total time= 0.1s\n", + "[CV 1/2] END ..learning_rate=1, n_estimators=50;, score=0.900 total time= 0.0s\n", + "[CV 2/2] END ..learning_rate=1, n_estimators=50;, score=0.933 total time= 0.0s\n", + "[CV 1/2] END .learning_rate=1, n_estimators=100;, score=0.900 total time= 0.1s\n", + "[CV 2/2] END .learning_rate=1, n_estimators=100;, score=0.933 total time= 0.1s\n", + "[CV 1/2] END .learning_rate=1, n_estimators=150;, score=0.900 total time= 0.1s\n", + "[CV 2/2] END .learning_rate=1, n_estimators=150;, score=0.933 total time= 0.1s\n", + "Best estimator RandomForestClassifier(n_estimators=50) and best score 0.9330819977445048 and best params {'n_estimators': 50}\n", + "Accuracy: 1.0\n" + ] + } + ], + "source": [ + "# Load the Iris dataset for classification\n", + "iris = load_iris()\n", + "\n", + "# Split the dataset into training and test sets\n", + "X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(\n", + " iris.data, iris.target, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# Define models and their parameter grids\n", + "estimator_list_cls = ['forest', 'gbf']\n", + "param_grid_list_cls = [{'n_estimators': [50, 100, 150]}, {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1]}]\n", + "\n", + "# Initialize SearchEstimatorList\n", + "sel_cls = SearchEstimatorList(\n", + " estimator_list=estimator_list_cls, \n", + " param_grid_list=param_grid_list_cls, \n", + " is_discrete=True,\n", + " verbose=3\n", + ")\n", + "\n", + "# Fit the model to the training data\n", + "sel_cls.fit(X_train_cls, y_train_cls)\n", + "\n", + "# Predict outcomes for the test set\n", + "predictions_cls = sel_cls.predict(X_test_cls)\n", + "\n", + "# Evaluate the model\n", + "acc = accuracy_score(y_test_cls, predictions_cls)\n", + "\n", + "# Print the evaluation metric\n", + "print(f\"Accuracy: {acc}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regressor" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 2 folds for each of 7 candidates, totalling 14 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method neg_mean_squared_error.\n", + " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV 1/2] END .....................l1_ratio=0.1;, score=-0.584 total time= 0.0s\n", + "[CV 2/2] END .....................l1_ratio=0.1;, score=-0.725 total time= 0.0s\n", + "[CV 1/2] END .....................l1_ratio=0.5;, score=-0.549 total time= 0.0s\n", + "[CV 2/2] END .....................l1_ratio=0.5;, score=-0.675 total time= 0.0s\n", + "[CV 1/2] END .....................l1_ratio=0.7;, score=-0.546 total time= 0.0s\n", + "[CV 2/2] END .....................l1_ratio=0.7;, score=-0.668 total time= 0.0s\n", + "[CV 1/2] END .....................l1_ratio=0.9;, score=-0.544 total time= 0.0s\n", + "[CV 2/2] END .....................l1_ratio=0.9;, score=-0.663 total time= 0.0s\n", + "[CV 1/2] END ....................l1_ratio=0.95;, score=-0.544 total time= 0.0s\n", + "[CV 2/2] END ....................l1_ratio=0.95;, score=-0.662 total time= 0.0s\n", + "[CV 1/2] END ....................l1_ratio=0.99;, score=-0.544 total time= 0.0s\n", + "[CV 2/2] END ....................l1_ratio=0.99;, score=-0.661 total time= 0.0s\n", + "[CV 1/2] END .......................l1_ratio=1;, score=-0.544 total time= 0.0s\n", + "[CV 2/2] END .......................l1_ratio=1;, score=-0.661 total time= 0.0s\n", + "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", + "[CV 1/2] END ............hidden_layer_sizes=50;, score=-0.712 
total time= 1.0s\n", + "[CV 2/2] END ............hidden_layer_sizes=50;, score=-0.580 total time= 1.3s\n", + "[CV 1/2] END ...........hidden_layer_sizes=100;, score=-0.695 total time= 0.8s\n", + "[CV 2/2] END ...........hidden_layer_sizes=100;, score=-2.334 total time= 1.0s\n", + "[CV 1/2] END ...........hidden_layer_sizes=200;, score=-0.641 total time= 8.1s\n", + "[CV 2/2] END ...........hidden_layer_sizes=200;, score=-1.162 total time= 5.4s\n", + "Best estimator ElasticNetCV(l1_ratio=1) and best score -0.6025662427788023 and best params {'l1_ratio': 1}\n", + "Mean Squared Error: 0.5555752649052167\n" + ] + } + ], + "source": [ + "# Import necessary packages\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_squared_error, accuracy_score\n", + "from sklearn.datasets import fetch_california_housing\n", + "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", + "\n", + "# Load the California Housing dataset for regression\n", + "california_housing = fetch_california_housing()\n", + "\n", + "# Split the dataset into training and test sets\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n", + " california_housing.data, california_housing.target, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# Define models and their parameter grids\n", + "# 'linear' selects an ElasticNet linear model and 'nnet' selects a neural-network regressor\n", + "estimator_list_reg = ['linear', 'nnet']\n", + "param_grid_list_reg = [{'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]}, {'hidden_layer_sizes': [50, 100, 200]}]\n", + "\n", + "# Initialize SearchEstimatorList\n", + "sel_reg = SearchEstimatorList(\n", + " estimator_list=estimator_list_reg, \n", + " param_grid_list=param_grid_list_reg,\n", + " is_discrete=False,\n", + " verbose=3\n", + ")\n", + "\n", + "# Fit the model to the training data\n", + "sel_reg.fit(X_train_reg, y_train_reg)\n", + "\n", + "# Predict outcomes for the test set\n", +
"predictions_reg = sel_reg.predict(X_test_reg)\n", + "\n", + "# Evaluate the model\n", + "mse = mean_squared_error(y_test_reg, predictions_reg)\n", + "\n", + "# Print the evaluation metric\n", + "print(f\"Mean Squared Error: {mse}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using all estimators" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method f1_macro.\n", + " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" + ] + } + ], + "source": [ + "search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Single Estimators and Model Objects" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best estimator LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag') and best score 0.966624895572264 and best params {'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", + "LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag')\n", + "{'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", + "mse of test dataset: 0.0\n", + "[[7.30818687e-04 9.18278306e-01 8.09908750e-02]\n", + " [9.96517769e-01 3.48223146e-03 9.52705844e-13]\n", + " [8.11833119e-11 2.27064968e-04 9.99772935e-01]\n", + " [1.49082115e-03 8.82474441e-01 1.16034738e-01]\n", + " [6.61814371e-04 9.57060549e-01 4.22776371e-02]\n", + " [9.94291457e-01 5.70854348e-03 8.51181731e-12]\n", + " [3.09570872e-02 9.66175329e-01 2.86758338e-03]\n", + " [1.03620286e-04 
2.72711857e-01 7.27184523e-01]\n", + " [1.86273814e-04 5.89659675e-01 4.10154051e-01]\n", + " [7.89829063e-03 9.84383361e-01 7.71834853e-03]\n", + " [1.79967697e-04 3.80342060e-01 6.19477972e-01]\n", + " [9.87625715e-01 1.23742845e-02 6.37903013e-11]\n", + " [9.97989545e-01 2.01045508e-03 2.71212460e-13]\n", + " [9.87073806e-01 1.29261936e-02 5.68033322e-11]\n", + " [9.97732149e-01 2.26785067e-03 1.43489213e-12]\n", + " [2.40047637e-03 9.42313621e-01 5.52859030e-02]\n", + " [1.40979957e-07 5.60447914e-03 9.94395380e-01]\n", + " [4.57991768e-03 9.78714479e-01 1.67056034e-02]\n", + " [1.07687184e-03 8.47974601e-01 1.50948527e-01]\n", + " [1.55738075e-07 5.44482660e-03 9.94555018e-01]\n", + " [9.84143440e-01 1.58565593e-02 2.21243624e-10]\n", + " [1.96353775e-04 3.77725182e-01 6.22078464e-01]\n", + " [9.90664487e-01 9.33551321e-03 6.98033897e-11]\n", + " [2.52736850e-07 8.46501225e-03 9.91534735e-01]\n", + " [1.95677109e-05 4.08891407e-01 5.91089025e-01]\n", + " [1.72461836e-05 8.83781623e-02 9.11604592e-01]\n", + " [1.09118029e-07 1.18285926e-02 9.88171298e-01]\n", + " [3.31801168e-07 1.03342423e-02 9.89665426e-01]\n", + " [9.86532115e-01 1.34678849e-02 1.68835118e-10]\n", + " [9.80493031e-01 1.95069688e-02 2.80655184e-10]]\n" + ] + } + ], + "source": [ + "with warnings.catch_warnings():\n", + " warnings.simplefilter(\"ignore\")\n", + "\n", + " from sklearn.linear_model import LogisticRegression\n", + " lr_param_grid = {\n", + " 'penalty': ['l1', 'l2', 'elasticnet', 'none'],\n", + " 'C': [0.001, 0.01, 0.1, 1, 10, 100],\n", + " 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],\n", + " 'max_iter': [50, 100, 200, 500],\n", + " }\n", + "\n", + " search = SearchEstimatorList(estimator_list = LogisticRegression(), param_grid_list= lr_param_grid, verbose=0, is_discrete=True)\n", + " search.fit(X_train_cls, y_train_cls)\n", + " print(search.best_model())\n", + " print(search.best_params_)\n", + " y_pred = search.predict(X_test_cls)\n", + "\n", + " mse = 
mean_squared_error(y_test_cls, y_pred)\n", + "\n", + "print(\"mse of test dataset:\", mse,)\n", + "print(search.predict_proba(X_test_cls))\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Polynomial Feature\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", + "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.322 total time= 0.3s\n", + "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.287 total time= 0.2s\n", + "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.000 total time= 0.3s\n", + "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.014 total time= 0.3s\n", + "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=4;, score=0.000 total time= 1.0s\n", + "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=4;, score=-0.000 total time= 1.1s\n", + "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.322 total time= 0.3s\n", + "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.287 total time= 0.2s\n", + "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.000 total time= 0.3s\n", + "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.014 total time= 0.4s\n", + "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=4;, score=0.000 total time= 1.5s\n", + "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=4;, score=-0.000 total time= 1.3s\n", + "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.322 total time= 0.2s\n", + "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.287 total time= 0.2s\n", + "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.000 total time= 0.3s\n", + "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.014 total time= 0.4s\n", + "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=4;, score=0.000 total time= 1.1s\n", + "[CV 2/2] END 
linear__l1_ratio=0.9, poly__degree=4;, score=-0.000 total time= 1.1s\n", + "Best estimator Pipeline(steps=[('poly', PolynomialFeatures()),\n", + " ('linear', ElasticNetCV(l1_ratio=0.9))]) and best score 0.30443941337924607 and best params {'linear__l1_ratio': 0.9, 'poly__degree': 2}\n", + "Mean Squared Error: 0.8894038237145269\n" + ] + } + ], + "source": [ + "with warnings.catch_warnings():\n", + " warnings.simplefilter(\"ignore\")\n", + " # For polynomial, please ensure that you have \"poly__\" (two \"_\" or underscores after poly) underneath to change degree\n", + " # To change the linear method please add \"linear__\" (two \"_\" or underscores after linear)\n", + " param_grid_list_poly = {'poly__degree': [2, 3, 4], 'linear__l1_ratio': [0.1, 0.5, 0.9]}\n", + " sel_reg = SearchEstimatorList(\n", + " estimator_list='poly', \n", + " param_grid_list=param_grid_list_poly,\n", + " is_discrete=False,\n", + " scoring='explained_variance',\n", + " verbose=3\n", + " )\n", + "\n", + " # Fit the model to the training data\n", + " sel_reg.fit(X_train_reg, y_train_reg)\n", + "\n", + " # Predict outcomes for the test set\n", + " predictions_reg = sel_reg.predict(X_test_reg)\n", + "\n", + " # Evaluate the model\n", + " mse = mean_squared_error(y_test_reg, predictions_reg)\n", + "\n", + " # Print the evaluation metric\n", + " print(f\"Mean Squared Error: {mse}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['linear', 'forest', 'gbf', 'nnet', 'poly']\n" + ] + } + ], + "source": [ + "# These are all of the supported models that we have that have built in hyper parameters already included\n", + "print(sel_reg.supported_models)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END 
.................................., score=-0.518 total time= 0.1s\n", + "[CV 2/2] END .................................., score=-0.552 total time= 0.0s\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-0.287 total time= 1.3s\n", + "[CV 2/2] END .................................., score=-0.293 total time= 1.3s\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-0.286 total time= 3.1s\n", + "[CV 2/2] END .................................., score=-0.274 total time= 3.1s\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-0.305 total time= 3.2s\n", + "[CV 2/2] END .................................., score=-0.305 total time= 3.0s\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-0.526 total time= 0.6s\n", + "[CV 2/2] END ................................., score=-12.077 total time= 0.5s\n", + "Best estimator RandomForestRegressor() and best score -0.27976201134927425 and best params {}\n", + "Mean Squared Error: 0.2508316133481009\n" + ] + } + ], + "source": [ + "# To try every type of model simply use the \"all\" option\n", + "with warnings.catch_warnings():\n", + " warnings.simplefilter(\"ignore\")\n", + " sel_reg = SearchEstimatorList(\n", + " estimator_list='all', \n", + " param_grid_list=None,\n", + " is_discrete=False,\n", + " scaling=True,\n", + " verbose=5\n", + " )\n", + "\n", + " # Fit the model to the training data\n", + " sel_reg.fit(X_train_reg, y_train_reg)\n", + "\n", + " # Predict outcomes for the test set\n", + " predictions_reg = sel_reg.predict(X_test_reg)\n", + "\n", + " # Evaluate the model\n", + " mse = mean_squared_error(y_test_reg, predictions_reg)\n", + "\n", + " # Print the evaluation metric\n", + " print(f\"Mean Squared Error: {mse}\")" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scoring functions\n", + "\n", + "Using a custom scoring function. See https://scikit-learn.org/stable/modules/model_evaluation.html for how to make your own scoring metric\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-0.741 total time= 0.0s\n", + "[CV 2/2] END .................................., score=-0.822 total time= 0.0s\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV 1/2] END .................................., score=-2.404 total time= 0.8s\n", + "[CV 2/2] END .................................., score=-1.671 total time= 0.8s\n", + "Best estimator ElasticNetCV() and best score -0.7813657065847333 and best params {}\n", + "Root Mean Squared Error: 0.7490149943228499\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.metrics import make_scorer\n", + "\n", + "def root_mean_squared_error(y_true, y_pred):\n", + " mse = mean_squared_error(y_true, y_pred)\n", + " rmse = np.sqrt(mse)\n", + " return rmse\n", + "loss_function = make_scorer(root_mean_squared_error, greater_is_better=False)\n", + "\n", + "sel_reg = SearchEstimatorList(\n", + " estimator_list=estimator_list_reg, \n", + " param_grid_list=None,\n", + " is_discrete=False,\n", + " scoring=loss_function,\n", + " verbose=3\n", + ")\n", + "\n", + "# Fit the model to the training data\n", + "sel_reg.fit(X_train_reg, y_train_reg)\n", + "\n", + "# Predict outcomes for the test set\n", + "predictions_reg = sel_reg.predict(X_test_reg)\n", + "\n", + "# Evaluate the model\n", + "rmse = root_mean_squared_error(y_test_reg, predictions_reg)\n", + "\n", + "# Print the evaluation metric\n", + "print(f\"Root Mean 
Squared Error: {rmse}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What this means for EconML?\n", + "\n", + "By integrating the SearchEstimatorList into econml, we can gain a number of benefits in these categories:\n", + "\n", + " Model Selection: econml contains many different models, each with its own assumptions and use cases. By using SearchEstimatorList, you can more easily compare the performance of different models on your data and select the best one.\n", + "\n", + " Hyperparameter Tuning: Many of the models in econml have hyperparameters that need to be tuned for optimal performance. SearchEstimatorList can automate this process by performing a grid search over specified hyperparameters for each model.\n", + "\n", + " Efficiency: Instead of having to manually train each model and tune its hyperparameters, SearchEstimatorList can do this all at once. This can save a significant amount of time and make the model building process more efficient.\n", + "\n", + "See the example below with data taken fromt he Customer Segmentation at an Online Media Company Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No scoring value was given. Using default score method neg_mean_squared_error.\n", + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + "A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*** Causal Estimate ***\n", + "\n", + "## Identified estimand\n", + "Estimand type: nonparametric-ate\n", + "\n", + "### Estimand : 1\n", + "Estimand name: backdoor\n", + "Estimand expression:\n", + " d \n", + "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", + "d[log_price] \n", + "\n", + " \n", + "_purchased,has_membership,is_US,account_age])\n", + " \n", + "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", + "\n", + "## Realized estimand\n", + "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", + "Target units: ate\n", + "\n", + "## Estimate\n", + "Mean value: 2.6518132830256684\n", + "Effect estimates: [ 2.57968831 -0.23224908 4.35502223 ... 
0.85234463 -3.53167996\n", + " 6.99294565]\n", + "\n" + ] + } + ], + "source": [ + "# Import the sample pricing data\n", + "file_url = \"https://msalicedatapublic.z5.web.core.windows.net/datasets/Pricing/pricing_sample.csv\"\n", + "train_data = pd.read_csv(file_url)\n", + "\n", + "# Data sample\n", + "train_data.head()\n", + "\n", + "# Define estimator inputs\n", + "train_data[\"log_demand\"] = np.log(train_data[\"demand\"])\n", + "train_data[\"log_price\"] = np.log(train_data[\"price\"])\n", + "\n", + "Y = train_data[\"log_demand\"].values\n", + "T = train_data[\"log_price\"].values\n", + "X = train_data[[\"income\"]].values # features\n", + "confounder_names = [\"account_age\", \"age\", \"avg_hours\", \"days_visited\", \"friends_count\", \"has_membership\", \"is_US\", \"songs_purchased\"]\n", + "W = train_data[confounder_names].values\n", + "\n", + "# Get test data\n", + "X_test = np.linspace(0, 5, 100).reshape(-1, 1)\n", + "X_test_data = pd.DataFrame(X_test, columns=[\"income\"])\n", + "\n", + "# initiate an EconML cate estimator\n", + "est = LinearDML(model_y='gbf', model_t='gbf',\n", + " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", + "\n", + "# fit through dowhy\n", + "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", + " confounder_names=confounder_names, inference=\"statsmodels\")\n", + "\n", + "lineardml_estimate = est_dw.estimate_\n", + "print(lineardml_estimate)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Define underlying treatment effect function given DGP\n", + "def gamma_fn(X):\n", + " return -3 - 14 * (X[\"income\"] < 1)\n", + "\n", + "def beta_fn(X):\n", + " return 20 + 0.5 * (X[\"avg_hours\"]) + 5 * (X[\"days_visited\"] > 4)\n", + "\n", + "def demand_fn(data, T):\n", + " Y = gamma_fn(data) * T + beta_fn(data)\n", + " return Y\n", + "\n", + "def true_te(x, n, stats):\n", + " if x < 1:\n", + " subdata = train_data[train_data[\"income\"] < 1].sample(n=n, replace=True)\n", + " else:\n", + " subdata = train_data[train_data[\"income\"] >= 1].sample(n=n, replace=True)\n", + " te_array = subdata[\"price\"] * gamma_fn(subdata) / (subdata[\"demand\"])\n", + " if stats == \"mean\":\n", + " return np.mean(te_array)\n", + " elif stats == \"median\":\n", + " return np.median(te_array)\n", + " elif isinstance(stats, int):\n", + " return np.percentile(te_array, stats)\n", + "\n", + "# Get the estimate and range of true treatment effect\n", + "truth_te_estimate = np.apply_along_axis(true_te, 1, X_test, 1000, \"mean\") # estimate\n", + "truth_te_upper = np.apply_along_axis(true_te, 1, X_test, 1000, 95) # upper level\n", + "truth_te_lower = np.apply_along_axis(true_te, 1, X_test, 1000, 5) # lower level\n", + "\n", + "te_pred = est_dw.effect(X_test).flatten()\n", + "te_pred_interval = est_dw.effect_interval(X_test)\n", + "\n", + "# Compare the estimate and the truth\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", + "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", + "plt.fill_between(\n", + " X_test.flatten(),\n", + " te_pred_interval[0].flatten(),\n", + " te_pred_interval[1].flatten(),\n", + " alpha=0.2,\n", + " label=\"95% Confidence Interval\",\n", + ")\n", + "plt.fill_between(\n", + " X_test.flatten(),\n", + " truth_te_lower,\n", + " truth_te_upper,\n", + " alpha=0.2,\n", + " 
label=\"True Elasticity Range\",\n", + ")\n", + "plt.xlabel(\"Income\")\n", + "plt.ylabel(\"Songs Sales Elasticity\")\n", + "plt.title(\"Songs Sales Elasticity vs Income\")\n", + "plt.legend(loc=\"lower right\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No scoring value was given. Using default score method neg_mean_squared_error.\n", + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing estimator: RandomForestRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... total time= 1.1s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... total time= 0.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing estimator: MLPRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... 
total time= 0.3s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... total time= 0.4s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best estimator RandomForestRegressor() and best score -0.007087413279468611 and best params {}\n", + "Processing estimator: RandomForestRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... total time= 2.3s\n", + "[CV] END .................................................... total time= 2.3s\n", + "Processing estimator: MLPRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... total time= 12.6s\n", + "[CV] END .................................................... total time= 10.5s\n", + "Best estimator RandomForestRegressor() and best score -0.015753967716546576 and best params {}\n", + "Processing estimator: RandomForestRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... 
total time= 0.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... total time= 0.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing estimator: MLPRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... total time= 0.2s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] END .................................................... total time= 0.3s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best estimator RandomForestRegressor() and best score -0.006845612318994855 and best params {}\n", + "Processing estimator: RandomForestRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... 
total time= 2.2s\n", + "[CV] END .................................................... total time= 2.1s\n", + "Processing estimator: MLPRegressor\n", + "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", + "[CV] END .................................................... total time= 12.2s\n", + "[CV] END .................................................... total time= 14.3s\n", + "Best estimator RandomForestRegressor() and best score -0.014455828883075759 and best params {}\n", + "*** Causal Estimate ***\n", + "\n", + "## Identified estimand\n", + "Estimand type: nonparametric-ate\n", + "\n", + "### Estimand : 1\n", + "Estimand name: backdoor\n", + "Estimand expression:\n", + " d \n", + "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", + "d[log_price] \n", + "\n", + " \n", + "_purchased,has_membership,is_US,account_age])\n", + " \n", + "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", + "\n", + "## Realized estimand\n", + "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", + "Target units: ate\n", + "\n", + "## Estimate\n", + "Mean value: -0.9764341213588181\n", + "Effect estimates: [-1.06939218 -1.44817143 -0.81689907 ... 
-1.30445479 -1.87209822\n", + " -0.40427838]\n", + "\n" + ] + } + ], + "source": [ + "# initiate an EconML cate estimator\n", + "\n", + "est = LinearDML(model_y=['forest', 'nnet'], model_t=['nnet', 'forest'], scaling=False,\n", + " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", + "\n", + "# fit through dowhy\n", + "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", + " confounder_names=confounder_names, inference=\"statsmodels\")\n", + "\n", + "lineardml_estimate = est_dw.estimate_\n", + "print(lineardml_estimate)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "te_pred = est_dw.effect(X_test).flatten()\n", + "te_pred_interval = est_dw.effect_interval(X_test)\n", + "\n", + "# Compare the estimate and the truth\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", + "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", + "plt.fill_between(\n", + " X_test.flatten(),\n", + " te_pred_interval[0].flatten(),\n", + " te_pred_interval[1].flatten(),\n", + " alpha=0.2,\n", + " label=\"95% Confidence Interval\",\n", + ")\n", + "plt.fill_between(\n", + " X_test.flatten(),\n", + " truth_te_lower,\n", + " truth_te_upper,\n", + " alpha=0.2,\n", + " label=\"True Elasticity Range\",\n", + ")\n", + "plt.xlabel(\"Income\")\n", + "plt.ylabel(\"Songs Sales Elasticity\")\n", + "plt.title(\"Songs Sales Elasticity vs Income\")\n", + "plt.legend(loc=\"lower right\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From fe1c5e1493b7b421c7fcc31b48bd7eadba101eab Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Mon, 6 Nov 2023 09:43:52 -0500 Subject: [PATCH 04/19] Model selection WIP Signed-off-by: Keith Battocchi --- econml/_ortho_learner.py | 40 +- econml/dml/_rlearner.py | 33 +- econml/dml/causal_forest.py | 18 +- econml/dml/dml.py | 150 +-- econml/dr/_drlearner.py | 49 +- econml/iv/dml/_dml.py | 210 ++-- econml/iv/dr/_dr.py | 235 ++-- econml/new_tests/test_model_selection.py | 267 ----- .../new_tests/test_model_selection_utils.py | 235 ---- econml/panel/dml/_dml.py | 46 
+- econml/sklearn_extensions/linear_model.py | 108 +- econml/sklearn_extensions/model_selection.py | 382 +++++- .../model_selection_utils.py | 39 +- econml/tests/test_dml.py | 26 +- econml/tests/test_dmliv.py | 8 +- econml/tests/test_driv.py | 27 +- econml/tests/test_drlearner.py | 13 +- econml/tests/test_missing_values.py | 2 +- econml/tests/test_ortho_learner.py | 14 +- econml/tests/test_refit.py | 12 +- econml/tests/utilities.py | 17 +- econml/utilities.py | 74 -- .../SearchEstimatorList functionality.ipynb | 1031 ----------------- 23 files changed, 780 insertions(+), 2256 deletions(-) delete mode 100644 econml/new_tests/test_model_selection.py delete mode 100644 econml/new_tests/test_model_selection_utils.py delete mode 100644 notebooks/SearchEstimatorList functionality.ipynb diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 15d7b7af3..270fd5d84 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -45,6 +45,7 @@ class in this module implements the general logic in a very versatile way from .utilities import (_deprecate_positional, check_input_arrays, cross_product, filter_none_kwargs, inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose) +from .sklearn_extensions.model_selection import ModelSelector try: import ray @@ -100,7 +101,7 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()} kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()} - model.fit(*args_train, **kwargs_train) + model.train(False, *args_train, **kwargs_train) nuisance_temp = model.predict(*args_test, **kwargs_test) if not isinstance(nuisance_temp, tuple): @@ -115,17 +116,18 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): return nuisance_temp, model, test_idxs, (score_temp if calculate_scores else None) -def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): +def _crossfit(model: 
ModelSelector, folds, use_ray, ray_remote_fun_option, *args, **kwargs): """ General crossfit based calculation of nuisance parameters. Parameters ---------- - model : object - An object that supports fit and predict. Fit must accept all the args - and the keyword arguments kwargs. Similarly predict must all accept - all the args as arguments and kwards as keyword arguments. The fit - function estimates a model of the nuisance function, based on the input + model : ModelSelector + An object that has train and predict methods. + The train method must take an 'is_selecting' argument first, and then + accept positional arguments `args` and keyword arguments `kwargs`; the predict method + just takes those `args` and `kwargs`. The train + method selects or estimates a model of the nuisance function, based on the input data to fit. Predict evaluates the fitted nuisance function on the input data to predict. folds : list of tuple or None @@ -177,7 +179,7 @@ def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def fit(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): @@ -202,13 +204,17 @@ def predict(self, X, y, W=None): """ model_list = [] + + kwargs = filter_none_kwargs(**kwargs) + model.train(True, *args, **kwargs) + calculate_scores = hasattr(model, 'score') # remove None arguments - kwargs = filter_none_kwargs(**kwargs) if folds is None: # skip crossfitting model_list.append(clone(model, safe=False)) - model_list[0].fit(*args, **kwargs) + model_list[0].train(True, *args, **kwargs) + model_list[0].train(False, *args, **kwargs) # fit the selected model nuisances = model_list[0].predict(*args, **kwargs) scores = model_list[0].score(*args, **kwargs) if calculate_scores else None @@ -394,7 +400,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y 
- def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -448,7 +454,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) self._model_y.fit(W, Y) return self @@ -532,15 +538,15 @@ def _gen_allowed_missing_vars(self): @abstractmethod def _gen_ortho_learner_model_nuisance(self): - """ Must return a fresh instance of a nuisance model + """Must return a fresh instance of a nuisance model selector Returns ------- - model_nuisance: estimator - The estimator for fitting the nuisance function. Must implement - `fit` and `predict` methods that both have signatures:: + model_nuisance: selector + The selector for fitting the nuisance function. The returned estimator must implement + `train` and `predict` methods that both have signatures:: - model_nuisance.fit(Y, T, X=X, W=W, Z=Z, + model_nuisance.train(is_selecting, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) model_nuisance.predict(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index bd645fda3..b1bc9e2ad 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -29,40 +29,35 @@ import numpy as np import copy from warnings import warn + +from ..sklearn_extensions.model_selection import ModelSelector from ..utilities import (shape, reshape, ndim, hstack, filter_none_kwargs, _deprecate_positional) from sklearn.linear_model import LinearRegression from sklearn.base import clone from .._ortho_learner import _OrthoLearner -class _ModelNuisance: +class _ModelNuisance(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns the residuals as two nuisance 
parameters. """ - def __init__(self, model_y, model_t): + def __init__(self, model_y: ModelSelector, model_t: ModelSelector): self._model_y = model_y self._model_t = model_t - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): assert Z is None, "Cannot accept instrument!" - self._model_t.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_y.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_y.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - if hasattr(self._model_y, 'score'): - # note that groups are not passed to score because they are only used for fitting - Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) - else: - Y_score = None - if hasattr(self._model_t, 'score'): - # note that groups are not passed to score because they are only used for fitting - T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) - else: - T_score = None + # note that groups are not passed to score because they are only used for fitting + T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) + Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) return Y_score, T_score def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -302,7 +297,7 @@ def _gen_model_y(self): """ Returns ------- - model_y: estimator of E[Y | X, W] + model_y: selector for the estimator of E[Y | X, W] The estimator for fitting the response to the features and controls. 
Must implement `fit` and `predict` methods. Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. :: @@ -317,7 +312,7 @@ def _gen_model_t(self): """ Returns ------- - model_t: estimator of E[T | X, W] + model_t: selector for the estimator of E[T | X, W] The estimator for fitting the treatment to the features and controls. Must implement `fit` and `predict` methods. Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. :: @@ -432,11 +427,11 @@ def rlearner_model_final_(self): @property def models_y(self): - return [[mdl._model_y for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t(self): - return [[mdl._model_t for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y(self): diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 4f038eb3f..757b498ef 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split from itertools import product from .dml import _BaseDML -from .dml import _FirstStageWrapper +from .dml import _make_first_stage_selector from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..inference import NormalInferenceResults @@ -668,22 +668,10 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, False, 
self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return MultiOutputGRF(CausalForest(n_estimators=self.n_estimators, diff --git a/econml/dml/dml.py b/econml/dml/dml.py index d7c59013b..caa12e0c2 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -29,76 +29,85 @@ from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..utilities import (_deprecate_positional, add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, filter_none_kwargs) from .._shap import _shap_explain_model_cate -from ..sklearn_extensions.model_selection import SearchEstimatorList -import pdb +from ..sklearn_extensions.model_selection import get_selector, ModelSelector, SingleModelSelector -class _FirstStageWrapper: - def __init__(self, model, is_Y, featurizer, linear_first_stages, discrete_treatment): - self._model = clone(model, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._is_Y = is_Y - self._linear_first_stages = linear_first_stages - self._discrete_treatment = discrete_treatment - - def _combine(self, X, W, n_samples, fitting=True): - if X is None: - # if both X and W are None, just return a column of ones - return (W if W is not None else np.ones((n_samples, 1))) - XW = hstack([X, W]) if W is not None else X - if self._is_Y 
and self._linear_first_stages: - if self._featurizer is None: - F = X - else: - F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X) - return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F])) - else: - return XW +def _combine(X, W, n_samples): + if X is None: + # if both X and W are None, just return a column of ones + return (W if W is not None else np.ones((n_samples, 1))) + return hstack([X, W]) if W is not None else X - def fit(self, X, W, Target, sample_weight=None, groups=None): - if (not self._is_Y) and self._discrete_treatment: - # In this case, the Target is the one-hot-encoding of the treatment variable - # We need to go back to the label representation of the one-hot so as to call - # the classifier. - if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - Target = inverse_onehot(Target) - if sample_weight is not None: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups, - sample_weight=sample_weight) - else: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups) - return self +class _FirstStageWrapper: + def __init__(self, model, discrete_target): + self._model = model # plain sklearn-compatible model, not a ModelSelector + self._discrete_target = discrete_target def predict(self, X, W): n_samples = X.shape[0] if X is not None else (W.shape[0] if W is not None else 1) - if (not self._is_Y) and self._discrete_treatment: - return self._model.predict_proba(self._combine(X, W, n_samples, fitting=False))[:, 1:] + if self._discrete_target: + return self._model.predict_proba(_combine(X, W, n_samples))[:, 1:] else: - return self._model.predict(self._combine(X, W, n_samples, fitting=False)) + return self._model.predict(_combine(X, W, n_samples)) def score(self, X, W, Target, sample_weight=None): if 
hasattr(self._model, 'score'): - if (not self._is_Y) and self._discrete_treatment: + if self._discrete_target: # In this case, the Target is the one-hot-encoding of the treatment variable # We need to go back to the label representation of the one-hot so as to call # the classifier. Target = inverse_onehot(Target) if sample_weight is not None: - return self._model.score(self._combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) + return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) else: - return self._model.score(self._combine(X, W, Target.shape[0]), Target) + return self._model.score(_combine(X, W, Target.shape[0]), Target) else: return None +class _FirstStageSelector(SingleModelSelector): + def __init__(self, model: SingleModelSelector, discrete_target): + self._model = clone(model, safe=False) + self._discrete_target = discrete_target + + def train(self, is_selecting, X, W, Target, sample_weight=None, groups=None): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. 
+ if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + self._model.train(is_selecting, _combine(X, W, Target.shape[0]), Target, + **filter_none_kwargs(groups=groups, sample_weight=sample_weight)) + return self + + @property + def best_model(self): + return _FirstStageWrapper(self._model.best_model, self._discrete_target) + + @property + def best_score(self): + return self._model.best_score + + +def _make_first_stage_selector(model, is_discrete, random_state): + if model == 'auto': + model = ['forest', 'linear'] + return _FirstStageSelector(get_selector(model, + is_discrete=is_discrete, + random_state=random_state), + discrete_target=is_discrete) + + class _FinalWrapper: def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick): self._model = clone(model_final, safe=False) @@ -359,7 +368,7 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn `fit` and `predict` methods, and must be a linear model for correctness. param_list: list or 'auto', default 'auto' - The list of parameters to be used during cross-validation. + The list of parameters to be used during cross-validation. If 'auto', it will be chosen based on the model type. 
scaling: bool, default True @@ -538,45 +547,11 @@ def _gen_allowed_missing_vars(self): def _gen_featurizer(self): return clone(self.featurizer, safe=False) - def _gen_model_y(self): # New - if self.model_y == 'auto': - model_y = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_y, scoring=self.scoring_y, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state) - else: - model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scoring=self.scoring_y, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # if self.model_y == 'auto': - # model_y = WeightedLassoCVWrapper(random_state=self.random_state) - # else: - # model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) - - def _gen_model_t(self): # New - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scoring=self.scoring_t, - scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state) - else: - model_t = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_t, scoring=self.scoring_t, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state) + def _gen_model_y(self): + return _make_first_stage_selector(self.model_y, False, self.random_state) - else: - model_t = clone(SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, - scaling=self.scaling, 
verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # if self.model_t == 'auto': - # if self.discrete_treatment: - # model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - # random_state=self.random_state) - # else: - # model_t = WeightedLassoCVWrapper(random_state=self.random_state) - # else: - # model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + def _gen_model_t(self): + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) @@ -1520,12 +1495,11 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - return _FirstStageWrapper(clone(self.model_y, safe=False), True, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - return _FirstStageWrapper(clone(self.model_t, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 9b75ca75d..1f74890e0 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -43,6 +43,7 @@ LogisticRegressionCV) from sklearn.ensemble import RandomForestRegressor + from .._ortho_learner import _OrthoLearner from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, BaseCateEstimator, ForestModelFinalCateEstimatorDiscreteMixin, @@ -51,13 +52,17 @@ from ..grf import RegressionForest from ..sklearn_extensions.linear_model import ( 
DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) +from ..sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, get_selector from ..utilities import (_deprecate_positional, check_high_dimensional, - filter_none_kwargs, fit_with_groups, inverse_onehot, get_feature_names_or_default) + filter_none_kwargs, inverse_onehot, get_feature_names_or_default) from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate -class _ModelNuisance: - def __init__(self, model_propensity, model_regression, min_propensity): +class _ModelNuisance(ModelSelector): + def __init__(self, + model_propensity: SingleModelSelector, + model_regression: SingleModelSelector, + min_propensity): self._model_propensity = model_propensity self._model_regression = model_regression self._min_propensity = min_propensity @@ -65,7 +70,7 @@ def __init__(self, model_propensity, model_regression, min_propensity): def _combine(self, X, W): return np.hstack([arr for arr in [X, W] if arr is not None]) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, *, sample_weight=None, groups=None): if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " "instead got {1}.".format(len(X), Y.shape)) @@ -77,22 +82,16 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - fit_with_groups(self._model_propensity, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) - fit_with_groups(self._model_regression, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) + self._model_propensity.train(is_selecting, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) + self._model_regression.train(is_selecting, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) return self def score(self, Y, T, X=None, 
W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - if hasattr(self._model_propensity, 'score'): - propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) - else: - propensity_score = None - if hasattr(self._model_regression, 'score'): - regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) - else: - regression_score = None + propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) + regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) return propensity_score, regression_score @@ -114,6 +113,12 @@ def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,)) +def _make_first_stage_selector(model, is_discrete, random_state): + if model == "auto": + model = ['linear', 'forest'] + return get_selector(model, is_discrete=is_discrete, random_state=random_state) + + class _ModelFinal: # Coding Remark: The reasoning around the multitask_model_final could have been simplified if # we simply wrapped the model_final with a MultiOutputRegressor. 
However, because we also want @@ -499,16 +504,8 @@ def _get_inference_options(self): return options def _gen_ortho_learner_model_nuisance(self): - if self.model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=self.random_state) - else: - model_propensity = clone(self.model_propensity, safe=False) - - if self.model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) - else: - model_regression = clone(self.model_regression, safe=False) + model_propensity = _make_first_stage_selector(self.model_propensity, True, self.random_state) + model_regression = _make_first_stage_selector(self.model_regression, False, self.random_state) return _ModelNuisance(model_propensity, model_regression, self.min_propensity) @@ -648,7 +645,7 @@ def models_propensity(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_propensity for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_propensity.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_regression(self): @@ -662,7 +659,7 @@ def models_regression(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_regression for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_regression.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_propensity(self): diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index c8889599f..1cddcc247 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -24,17 +24,30 @@ from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin, LinearCateEstimator from ...inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference from ...sklearn_extensions.linear_model import StatsModels2SLS, StatsModelsLinearRegression, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import (ModelSelector, SingleModelSelector, + WeightedStratifiedKFold, get_selector) from ...utilities import (_deprecate_positional, get_feature_names_or_default, filter_none_kwargs, add_intercept, cross_product, broadcast_unit_treatments, reshape_treatmentwise_effects, shape, parse_final_model_params, deprecated, Summary) -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...dml._rlearner import _ModelFinal from ..._shap import _shap_explain_joint_linear_model_cate, _shap_explain_model_cate -class _OrthoIVModelNuisance: - def __init__(self, model_y_xw, model_t_xw, model_z, projection): +def _combine(W, Z, n_samples): + if Z is not None: + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _OrthoIVNuisanceSelector(ModelSelector): + + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xw: SingleModelSelector, + model_z: SingleModelSelector, + projection): self._model_y_xw = model_y_xw self._model_t_xw = model_t_xw self._projection = projection @@ -43,21 +56,15 @@ def 
__init__(self, model_y_xw, model_t_xw, model_z, projection): else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -71,7 +78,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) if hasattr(self._model_t_xwz, 'score'): T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: @@ -91,7 +98,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = 
self._model_t_xwz.predict(X, WZ) else: Z_pred = self._model_z_xw.predict(X=X, W=W) @@ -387,57 +394,29 @@ def _gen_ortho_learner_model_final(self): return _OrthoIVModelFinal(self._gen_model_final(), self._gen_featurizer(), self.fit_cate_intercept) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) + model_y = _make_first_stage_selector(self.model_y_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_t = _make_first_stage_selector(self.model_t_xw, + is_discrete=self.discrete_treatment, + random_state=self.random_state) if self.projection: # train E[T|X,W,Z] - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_t_xwz, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - self.projection) + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - # train [Z|X,W] - if self.model_z_xw == "auto": - if self.discrete_instrument: - 
model_z_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) + # train E[Z|X,W] + # note: discrete_instrument rather than discrete_treatment in call to _make_first_stage_selector + model_z = _make_first_stage_selector(self.model_z_xw, + is_discrete=self.discrete_instrument, + random_state=self.random_state) - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_z_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_instrument), - self.projection) + return _OrthoIVNuisanceSelector(model_y, model_t, model_z, + self.projection) def fit(self, Y, T, *, Z, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, cache_values=False, inference="auto"): @@ -604,7 +583,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -618,7 +597,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_z_xw(self): @@ -634,7 +613,7 @@ def models_z_xw(self): """ if self.projection: raise AttributeError("Projection model is fitted for instrument! Use models_t_xwz.") - return [[mdl._model_z_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_z_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -650,7 +629,7 @@ def models_t_xwz(self): """ if not self.projection: raise AttributeError("Direct model is fitted for instrument! Use models_z_xw.") - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y_xw(self): @@ -717,29 +696,24 @@ def residuals_(self): return Y_res, T_res, Z_res, self._cached_values.X, self._cached_values.W, self._cached_values.Z -class _BaseDMLIVModelNuisance: +class _BaseDMLIVNuisanceSelector(ModelSelector): """ Nuisance model fits the three models at fit time and at predict time returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
""" - def __init__(self, model_y_xw, model_t_xw, model_t_xwz): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xw = clone(model_t_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W + def __init__(self, model_y_xw: ModelSelector, model_t_xw: ModelSelector, model_t_xwz: ModelSelector): + self._model_y_xw = model_y_xw + self._model_t_xw = model_t_xw + self._model_t_xwz = model_t_xwz - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_t_xw.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t_xw.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X, WZ, T, + **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -754,7 +728,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight)) else: T_XZ_score = None @@ -764,7 +738,7 @@ def predict(self, 
Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) # note that sample_weight and groups are not passed to predict because they are only used for fitting Y_pred = self._model_y_xw.predict(X, W) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) TXZ_pred = self._model_t_xwz.predict(X, WZ) TX_pred = self._model_t_xw.predict(X, W) if (X is None) and (W is None): # In this case predict above returns a single row @@ -909,7 +883,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -923,7 +897,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -937,7 +911,7 @@ def models_t_xwz(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y_xw(self): @@ -1183,42 +1157,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xwz, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(self._gen_model_y_xw(), 
self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), @@ -1579,42 +1530,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xwz, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return 
_BaseDMLIVModelNuisance(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index c06df6278..e4bbb81b9 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -27,16 +27,23 @@ LinearCateEstimator) from ...inference import StatsModelsInference from ...sklearn_extensions.linear_model import StatsModelsLinearRegression, DebiasedLasso, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, WeightedStratifiedKFold from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs, inverse_onehot, get_feature_names_or_default, check_high_dimensional, check_input_arrays) from ...grf import RegressionForest -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...iv.dml import NonParamDMLIV from ..._shap import _shap_explain_model_cate -class _BaseDRIVModelNuisance: +def _combine(W, Z, n_samples): + if Z is not None: # Z will not be None + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _BaseDRIVNuisanceSelector(ModelSelector): def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, model_z, projection, fit_cov_directly, discrete_treatment, discrete_instrument): @@ -53,22 +60,30 @@ def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, mo else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W 
is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary T = T.ravel() if not self._discrete_treatment else T Z = Z.ravel() if not self._discrete_instrument else Z - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting and self._fit_cov_directly: + # need to fit, too, since we call predict later inside this train method + self._model_t_xw.train(False, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + + if self._projection: + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_t_xwz.train(False, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + else: + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_z_xw.train(False, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if self._projection: - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) if self._fit_cov_directly: # We're projecting, so we're treating E[T|X,Z] as the instrument (ignoring W for simplicity) @@ 
-82,15 +97,14 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: T_pred = T_pred.reshape(T.shape) target = (T_proj - T_pred)**2 - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: # return shape (n,) target = (T * T_proj).reshape(T.shape[0],) - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) if self._fit_cov_directly: Z_pred = self._model_z_xw.predict(X, W) T_pred = self._model_t_xw.predict(X, W) @@ -111,10 +125,10 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): target_shape = Z_res.shape if Z_res.ndim > 1 else T_res.shape target = T_res.reshape(target_shape) * Z_res.reshape(target_shape) # TODO: if the T and Z models overfit, then this will be biased towards 0; - # consider using nested cross-fitting here + # consider using nested cross-fitting # a similar comment applies to the projection case - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: if self._discrete_treatment: if self._discrete_instrument: @@ -130,8 +144,8 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: # shape(n,) target = T * Z - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) # TODO: prel_model_effect could allow sample_var and freq_weight? 
if self._discrete_instrument: @@ -168,7 +182,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): if self._projection: if hasattr(self._model_t_xwz, 'score'): - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) t_xwz_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: t_xwz_score = None @@ -232,7 +246,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) Z_res = T_proj - T_pred if self._fit_cov_directly: @@ -650,86 +664,38 @@ def _gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, False, self.random_state) + model_t_xw = _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) if self.projection: # this is a regression model since proj_t is probability - if self.model_tz_xw == "auto": - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = clone(self.model_tz_xw, safe=False) + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = 
LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - # outcome is continuous since proj_t is probability - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, False), - model_z=_FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + # we're using E[T|X,W,Z] as the instrument + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - if self.model_tz_xw == "auto": - if self.discrete_treatment and self.discrete_instrument and not self.fit_cov_directly: - model_tz_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = clone(self.model_tz_xw, safe=False) + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, is_discrete=(self.discrete_treatment and + self.discrete_instrument and + not self.fit_cov_directly), + random_state=self.random_state) - if self.model_z_xw == 'auto': - if self.discrete_instrument: - model_z_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = 
WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, (self.discrete_treatment and - self.discrete_instrument and - not self.fit_cov_directly)), - model_z=_FirstStageWrapper(model_z_xw, False, self._gen_featurizer(), - False, (self.discrete_instrument and - not self.fit_cov_directly)), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + model_z = _make_first_stage_selector(self.model_z_xw, is_discrete=self.discrete_instrument, + random_state=self.random_state) + + return _BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), + model_y_xw=model_y_xw, + model_t_xw=model_t_xw, + model_tz_xw=model_tz_xw, + model_z=model_z, + projection=self.projection, + fit_cov_directly=self.fit_cov_directly, + discrete_treatment=self.discrete_treatment, + discrete_instrument=self.discrete_instrument) class DRIV(_DRIV): @@ -1090,7 +1056,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -1104,7 +1070,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_z_xw(self): @@ -1120,7 +1086,7 @@ def models_z_xw(self): """ if self.projection: raise AttributeError("Projection model is fitted for instrument! Use models_t_xwz.") - return [[mdl._model_z_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_z_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -1136,7 +1102,7 @@ def models_t_xwz(self): """ if not self.projection: raise AttributeError("Direct model is fitted for instrument! Use models_z_xw.") - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_tz_xw(self): @@ -1150,7 +1116,7 @@ def models_tz_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_tz_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_tz_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_prel_model_effect(self): @@ -2342,25 +2308,23 @@ def model_final(self, model): raise ValueError("Parameter `model_final` cannot be altered for this estimator!") -class _IntentToTreatDRIVModelNuisance: - def __init__(self, model_y_xw, model_t_xwz, dummy_z, prel_model_effect): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - self._dummy_z = clone(dummy_z, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def _combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W +class _IntentToTreatDRIVNuisanceSelector(ModelSelector): + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xwz: SingleModelSelector, + dummy_z: SingleModelSelector, + prel_model_effect): + self._model_y_xw = model_y_xw + self._model_t_xwz = model_t_xwz + self._dummy_z = dummy_z + self._prel_model_effect = prel_model_effect - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - self._dummy_z.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + 
self._dummy_z.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) # we need to undo the one-hot encoding for calling effect, # since it expects raw values self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, @@ -2374,7 +2338,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: T_XZ_score = None @@ -2390,8 +2354,8 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_pred = self._model_y_xw.predict(X, W) - T_pred_zero = self._model_t_xwz.predict(X, self._combine(W, np.zeros(Z.shape), Y.shape[0])) - T_pred_one = self._model_t_xwz.predict(X, self._combine(W, np.ones(Z.shape), Y.shape[0])) + T_pred_zero = self._model_t_xwz.predict(X, _combine(W, np.zeros(Z.shape), Y.shape[0])) + T_pred_one = self._model_t_xwz.predict(X, _combine(W, np.ones(Z.shape), Y.shape[0])) Z_pred = self._dummy_z.predict(X, W) prel_theta = self._prel_model_effect.effect(X) @@ -2486,16 +2450,8 @@ def _gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xwz == 'auto': - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, is_discrete=False, random_state=self.random_state) + model_t_xwz = 
_make_first_stage_selector(self.model_t_xwz, is_discrete=True, random_state=self.random_state) if self.z_propensity == "auto": dummy_z = DummyClassifier(strategy="prior") @@ -2504,14 +2460,9 @@ def _gen_ortho_learner_model_nuisance(self): else: raise ValueError("Only 'auto' or float is allowed!") - return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False), - _FirstStageWrapper(model_t_xwz, False, - self._gen_featurizer(), False, True), - _FirstStageWrapper(dummy_z, False, - self._gen_featurizer(), False, True), - self._gen_prel_model_effect() - ) + dummy_z = _make_first_stage_selector(dummy_z, is_discrete=True, random_state=self.random_state) + + return _IntentToTreatDRIVNuisanceSelector(model_y_xw, model_t_xwz, dummy_z, self._gen_prel_model_effect()) class _DummyCATE: diff --git a/econml/new_tests/test_model_selection.py b/econml/new_tests/test_model_selection.py deleted file mode 100644 index 1eb82db0b..000000000 --- a/econml/new_tests/test_model_selection.py +++ /dev/null @@ -1,267 +0,0 @@ -import unittest - -import numpy as np -from econml.sklearn_extensions.model_selection import * -from econml.sklearn_extensions.model_selection_utils import * -from sklearn.datasets import fetch_california_housing, load_iris -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, f1_score -from sklearn.pipeline import make_pipeline -from sklearn.svm import SVR - - -class TestSearchEstimatorListClassifier(unittest.TestCase): - def setUp(self): - self.expected_accuracy = 0.9 - self.expected_f1_score = 0.9 - self.accuracy_tolerance = 0.05 - self.f1_score_tolerance = 0.05 - self.is_discrete = True - X, y = load_iris(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42) - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - - def 
test_initialization(self): - with self.assertRaises(ValueError): - SearchEstimatorList(estimator_list='invalid_estimator') - - def test_auto_param_grid_discrete(self): - - search_estimator_list = SearchEstimatorList(is_discrete=self.is_discrete, scaling=False) - search_estimator_list.fit(self.X_train, self.y_train) - self.assertIsNotNone(search_estimator_list.best_estimator_) - self.assertIsNotNone(search_estimator_list.best_score_) - self.assertIsNotNone(search_estimator_list.best_params_) - - def test_linear_estimator(self): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_poly_estimator(self): - search = SearchEstimatorList(estimator_list='poly', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertTrue(is_polynomial_pipeline(search.complete_estimator_list[0])) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_forest_estimator(self): - search = SearchEstimatorList(estimator_list='forest', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - 
self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], RandomForestClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_gbf_estimator(self): - search = SearchEstimatorList(estimator_list='gbf', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], GradientBoostingClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_nnet_estimator(self): - search = SearchEstimatorList(estimator_list='nnet', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], MLPClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_linear_and_forest_estimators(self): - search = SearchEstimatorList(estimator_list=['linear', 'forest'], is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 2) - self.assertEqual(len(search.param_grid_list), 2) - 
self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - self.assertIsInstance(search.complete_estimator_list[1], RandomForestClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_all_estimators(self): - search = SearchEstimatorList(estimator_list=['linear', 'forest', - 'gbf', 'nnet', 'poly'], is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 5) - self.assertEqual(len(search.param_grid_list), 5) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_logistic_regression_estimator(self): - search = SearchEstimatorList(estimator_list=LogisticRegression(), is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_logistic_regression_cv_estimator(self): - search = SearchEstimatorList(estimator_list=LogisticRegressionCV(), - is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_empty_estimator_list(self): - with self.assertRaises(ValueError): - search = SearchEstimatorList(estimator_list=[], is_discrete=self.is_discrete, scaling=False) - - def test_invalid_regressor(self): - with 
self.assertRaises(TypeError): - estimator_list = [SVR(kernel='linear')] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_polynomial_pipeline_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [make_pipeline(PolynomialFeatures(), ElasticNetCV())] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_mlp_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [MLPRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_random_forest_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [RandomForestRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_gradient_boosting_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [GradientBoostingRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_combined_estimators(self): - with self.assertRaises(TypeError): - estimator_list = [LogisticRegression(), SVC(), GradientBoostingRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_random_forest_discrete(self): - estimator_list = [RandomForestClassifier()] - param_grid_list = [{'n_estimators': [10, 50, 100], 'max_depth': [3, 5, None]}] - - search = SearchEstimatorList( - estimator_list=estimator_list, param_grid_list=param_grid_list, is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - - self.assertIsNotNone(search.best_estimator_) - self.assertIsNotNone(search.best_score_) - self.assertIsNotNone(search.best_params_) - - def test_data_scaling(self): - search = SearchEstimatorList(estimator_list='linear', 
is_discrete=self.is_discrete, scaling=True) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_custom_scoring_function(self): - def custom_scorer(y_true, y_pred): - return f1_score(y_true, y_pred, average='macro') - - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, scoring=custom_scorer) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - # def test_refit_false(self): - # search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False, refit=False) - # search.fit(self.X_train, self.y_train) - # with self.assertRaises(NotFittedError): - # y_pred = search.predict(self.X_test) - - def test_custom_random_state(self): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, random_state=42) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - 
self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - - def test_invalid_incorrect_scoring_numbers(self): - with self.assertRaises(ValueError): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, scoring=123) - - -if __name__ == '__main__': - unittest.main() diff --git a/econml/new_tests/test_model_selection_utils.py b/econml/new_tests/test_model_selection_utils.py deleted file mode 100644 index 8e7e7c917..000000000 --- a/econml/new_tests/test_model_selection_utils.py +++ /dev/null @@ -1,235 +0,0 @@ -import unittest - -import numpy as np -from econml.sklearn_extensions.model_selection import * -from econml.sklearn_extensions.model_selection_utils import * -from sklearn.datasets import fetch_california_housing, load_iris -from sklearn.preprocessing import StandardScaler, PolynomialFeatures -from sklearn.model_selection import train_test_split -from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV - - -class TestIsDataScaled(unittest.TestCase): - - def test_scaled_data(self): - # Test with data that is already centered and scaled - X = np.array([[0.0, -1.0], [1.0, 0.0], [-1.0, 1.0]]) - scale = StandardScaler() - scaled_X = scale.fit_transform(X) - self.assertTrue(is_data_scaled(scaled_X)) - - def test_unscaled_data(self): - # Test with data that is not centered and scaled - X = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]]) - self.assertFalse(is_data_scaled(X)) - - def test_large_scaled_data(self): - # Test with a larger dataset that is already centered and scaled - np.random.seed(42) - X = np.random.randn(1000, 5) - scale = StandardScaler() - scaled_X = scale.fit_transform(X) - self.assertTrue(is_data_scaled(scaled_X)) - - def test_large_unscaled_data(self): - np.random.seed(42) - X = np.random.randn(1000, 5) - self.assertFalse(is_data_scaled(X)) - - def 
test_is_data_scaled_with_scaled_iris_dataset(self): - X, y = load_iris(return_X_y=True) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - assert is_data_scaled(X_scaled) == True - - def test_is_data_scaled_with_unscaled_iris_dataset(self): - X, y = load_iris(return_X_y=True) - assert is_data_scaled(X) == False - - def test_is_data_scaled_with_scaled_california_housing_dataset(self): - X, y = housing = fetch_california_housing(return_X_y=True) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - assert is_data_scaled(X_scaled) == True - - def test_is_data_scaled_with_unscaled_california_housing_dataset(self): - X, y = fetch_california_housing(return_X_y=True) - assert is_data_scaled(X) == False - - -class TestFlattenList(unittest.TestCase): - - def test_flatten_empty_list(self): - input = [] - expected_output = [] - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_simple_list(self): - input = [1, 10, 15] - expected_output = [1, 10, 15] - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_nested_list(self): - input = [1, [10, 15], [20, [25, 30]]] - expected_output = [1, 10, 15, 20, 25, 30] - self.assertEqual(flatten_list(input), expected_output) - - # Check functionality for below - # def test_flatten_none_list(self): - # input = [[1, 10, None], 15, None] - # expected_output = [1, 10, None, 15, None] - # self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_iris_dataset(self): - X = load_iris() - input = X.data.tolist() - expected_output = sum(X.data.tolist(), []) - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_california_housing_dataset(self): - X = fetch_california_housing() - input = X.data.tolist() - expected_output = sum(X.data.tolist(), []) - self.assertEqual(flatten_list(input), expected_output) - - -class TestIsPolynomialPipeline(unittest.TestCase): - - def test_is_polynomial_pipeline_true(self): - X = np.array([[5, 10], [15, 
20], [25, 30], [35, 40], [45, 50]]) - y = np.array([15, 29, 38, 47, 55]) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - model = Pipeline([ - ('poly', PolynomialFeatures(degree=2)), - ('linear', ElasticNetCV()) - ]) - model.fit(X_scaled, y) - assert is_polynomial_pipeline(model) == True - - def test_is_polynomial_pipeline_false(self): - model = ElasticNetCV() - assert is_polynomial_pipeline(model) == False - - def test_is_polynomial_pipeline_false_step_number(self): - X, y = load_iris(return_X_y=True) - model = Pipeline([ - ('poly', PolynomialFeatures(degree=2)), - ('linear', LogisticRegressionCV()), - ('step_false', '') - ]) - assert is_polynomial_pipeline(model) == False - - def test_is_polynomial_pipeline_interchange_steps(self): - X, y = load_iris(return_X_y=True) - model = Pipeline([ - ('poly', LogisticRegressionCV()), - ('linear', PolynomialFeatures(degree=2)), - ]) - assert is_polynomial_pipeline(model) == False - - # Cross-check functionaity - can the 'poly' keyword be changed to something else - def test_is_polynomial_pipeline_false_first_step(self): - X, y = fetch_california_housing(return_X_y=True) - model = Pipeline([ - ('not_poly', PolynomialFeatures(degree=2)), - ('linear', ElasticNetCV()) - ]) - assert is_polynomial_pipeline(model) == True - - -class TestCheckListType(unittest.TestCase): - - def test_check_list_type_true(self): - list = ['linear', LogisticRegressionCV(), KFold()] - assert check_list_type(list) == True - - def test_check_list_type_false_string(self): - list = [18, LogisticRegressionCV(), KFold()] - try: - check_list_type(list) - except TypeError as e: - assert str(e) == "The list must contain only strings, sklearn model objects, and sklearn model selection objects." - - def test_check_list_type_empty(self): - list = [] - try: - check_list_type(list) - except ValueError as e: - assert str(e) == "Estimator list is empty. Please add some models or use some of the defaults provided." 
- - def test_check_list_type_all_strings(self): - list = ['linear', 'lasso', 'forest'] - assert check_list_type(list) == True - - def test_check_list_type_all_models(self): - list = [LogisticRegressionCV(), ElasticNetCV()] - assert check_list_type(list) == True - - def test_check_list_duplicate_models_strings(self): - list = [LogisticRegressionCV(), LogisticRegressionCV(), 'linear', 'linear'] - assert check_list_type(list) == True - - -class TestSelectContinuousEstimator(unittest.TestCase): - - def test_select_continuous_estimator_valid(self): - assert isinstance(select_continuous_estimator('linear'), ElasticNetCV) - assert isinstance(select_continuous_estimator('forest'), RandomForestRegressor) - assert isinstance(select_continuous_estimator('gbf'), GradientBoostingRegressor) - assert isinstance(select_continuous_estimator('nnet'), MLPRegressor) - assert isinstance(select_continuous_estimator('poly'), Pipeline) - - def test_select_continuous_estimator_invalid(self): - try: - select_continuous_estimator('ridge') - except ValueError as e: - assert str(e) == 'Unsupported estimator type: ridge' - - -class TestSelectDiscreteEstimator(unittest.TestCase): - - def test_select_discrete_estimator_valid(self): - assert isinstance(select_discrete_estimator('linear'), LogisticRegressionCV) - assert isinstance(select_discrete_estimator('forest'), RandomForestClassifier) - assert isinstance(select_discrete_estimator('gbf'), GradientBoostingClassifier) - assert isinstance(select_discrete_estimator('nnet'), MLPClassifier) - assert isinstance(select_discrete_estimator('poly'), Pipeline) - - def test_select_discrete_estimator_invalid(self): - try: - select_discrete_estimator('lasso') - except ValueError as e: - assert str(e) == 'Unsupported estimator type: lasso' - - -class TestSelectEstimator(unittest.TestCase): - - def test_select_estimator_valid(self): - assert isinstance(select_estimator('linear', is_discrete=False), ElasticNetCV) - assert isinstance(select_estimator('forest', 
is_discrete=False), RandomForestRegressor) - assert isinstance(select_estimator('gbf', is_discrete=False), GradientBoostingRegressor) - assert isinstance(select_estimator('nnet', is_discrete=False), MLPRegressor) - assert isinstance(select_estimator('poly', is_discrete=False), Pipeline) - - assert isinstance(select_estimator('linear', is_discrete=True), LogisticRegression) - assert isinstance(select_estimator('forest', is_discrete=True), RandomForestClassifier) - assert isinstance(select_estimator('gbf', is_discrete=True), GradientBoostingClassifier) - assert isinstance(select_estimator('nnet', is_discrete=True), MLPClassifier) - assert isinstance(select_estimator('poly', is_discrete=True), Pipeline) - - def test_select_estimator_invalid_estimator(self): - try: - select_estimator('lasso', is_discrete=True) - except ValueError as e: - assert str(e) == 'Unsupported estimator type: lasso' - - def test_select_estimator_invalid(self): - try: - select_estimator('linear', is_discrete=None) - except ValueError as e: - assert str(e) == 'Unsupported target type: None' - - -if __name__ == '__main__': - unittest.main() diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index c3dc96a4e..97190639b 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -9,13 +9,13 @@ from scipy.stats import norm from sklearn.linear_model import (ElasticNetCV, LassoCV, LogisticRegressionCV) from ...sklearn_extensions.linear_model import (StatsModelsLinearRegression, WeightedLassoCVWrapper) -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...sklearn_extensions.model_selection import ModelSelector, WeightedStratifiedKFold +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ..._cate_estimator import TreatmentExpansionMixin, LinearModelFinalCateEstimatorMixin from ..._ortho_learner import _OrthoLearner from ...utilities import (_deprecate_positional, 
add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, check_input_arrays, @@ -33,7 +33,7 @@ def _get_groups_period_filter(groups, n_periods): return group_period_filter -class _DynamicModelNuisance: +class _DynamicModelNuisanceSelector(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns @@ -45,21 +45,27 @@ def __init__(self, model_y, model_t, n_periods): self._model_t = model_t self.n_periods = n_periods - def fit(self, Y, T, X=None, W=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, sample_weight=None, groups=None): """Fit a series of nuisance models for each period or period pairs.""" assert Y.shape[0] % self.n_periods == 0, \ "Length of training data should be an integer multiple of time periods." 
period_filters = _get_groups_period_filter(groups, self.n_periods) - self._model_y_trained = {} - self._model_t_trained = {j: {} for j in np.arange(self.n_periods)} + if is_selecting: # create the per-period y and t models + self._model_y_trained = {t: clone(self._model_y, safe=False) + for t in np.arange(self.n_periods)} + self._model_t_trained = {j: {t: clone(self._model_t, safe=False) + for t in np.arange(j + 1)} + for j in np.arange(self.n_periods)} for t in np.arange(self.n_periods): - self._model_y_trained[t] = clone(self._model_y, safe=False).fit( + self._model_y_trained[t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None( W, period_filters[t]), Y[period_filters[self.n_periods - 1]]) for j in np.arange(t, self.n_periods): - self._model_t_trained[j][t] = clone(self._model_t, safe=False).fit( + self._model_t_trained[j][t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None(W, period_filters[t]), T[period_filters[j]]) @@ -534,30 +540,18 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, 
+ is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return StatsModelsLinearRegression(fit_intercept=False) def _gen_ortho_learner_model_nuisance(self): - return _DynamicModelNuisance( + return _DynamicModelNuisanceSelector( model_t=self._gen_model_t(), model_y=self._gen_model_y(), n_periods=self._n_periods) diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index 0c90c6868..8045d23bf 100644 --- a/econml/sklearn_extensions/linear_model.py +++ b/econml/sklearn_extensions/linear_model.py @@ -20,8 +20,7 @@ import warnings from collections.abc import Iterable from scipy.stats import norm -from econml.sklearn_extensions.model_selection import WeightedKFold, WeightedStratifiedKFold -from econml.utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays +from ..utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays from sklearn import clone from sklearn.linear_model import LinearRegression, LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso from sklearn.linear_model._base import _preprocess_data @@ -41,7 +40,24 @@ from typing import List +class _WeightedCVIterableWrapper(_CVIterableWrapper): + def __init__(self, cv): + super().__init__(cv) + + def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().get_n_splits(X, y, groups) + + def split(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().split(X, y, groups) + + def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): + # local import to avoid circular imports + from .model_selection import WeightedKFold, WeightedStratifiedKFold cv = 5 if cv is None else cv if isinstance(cv, 
numbers.Integral): if (classifier and (y is not None) and @@ -60,21 +76,6 @@ def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): return cv # New style cv objects are passed without any modification -class _WeightedCVIterableWrapper(_CVIterableWrapper): - def __init__(self, cv): - super().__init__(cv) - - def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().get_n_splits(X, y, groups) - - def split(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().split(X, y, groups) - - class WeightedModelMixin: """Mixin class for weighted models. @@ -1204,73 +1205,90 @@ def _set_attribute(self, attribute_name, condition=True, default=None): setattr(self, attribute_name, attribute_value) -class WeightedLassoCVWrapper: - """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" +class _PairedEstimatorWrapper: + """Helper class to wrap two different estimators, one of which can be used only with single targets and the other + which can be used on multiple targets. Not intended to be used directly by users.""" + + _SingleEst = None + _MultiEst = None + _known_params = [] + _post_fit_attrs = [] def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs - # set model to WeightedLassoCV by default so there's always a model to get and set attributes on - self.model = WeightedLassoCV(*args, **kwargs) - - # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV - # (e.g. 
former has 'positive' and 'precompute' while latter does not) - known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', - 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + # set model to the single-target estimator by default so there's always a model to get and set attributes on + self.model = self._SingleEst(*args, **kwargs) def fit(self, X, y, sample_weight=None): - self.needs_unravel = False + self._needs_unravel = False params = {key: value for (key, value) in self.get_params().items() - if key in self.known_params} + if key in self._known_params} if ndim(y) == 2 and shape(y)[1] > 1: - self.model = WeightedMultiTaskLassoCV(**params) + self.model = self._MultiEst(**params) else: if ndim(y) == 2 and shape(y)[1] == 1: y = np.ravel(y) - self.needs_unravel = True - self.model = WeightedLassoCV(**params) + self._needs_unravel = True + self.model = self._SingleEst(**params) self.model.fit(X, y, sample_weight) - # set intercept_ attribute - self.intercept_ = self.model.intercept_ - # set coef_ attribute - self.coef_ = self.model.coef_ - # set alpha_ attribute - self.alpha_ = self.model.alpha_ - # set alphas_ attribute - self.alphas_ = self.model.alphas_ - # set n_iter_ attribute - self.n_iter_ = self.model.n_iter_ + for param in self._post_fit_attrs: + setattr(self, param, getattr(self.model, param)) return self def predict(self, X): predictions = self.model.predict(X) - return reshape(predictions, (-1, 1)) if self.needs_unravel else predictions + return reshape(predictions, (-1, 1)) if self._needs_unravel else predictions def score(self, X, y, sample_weight=None): return self.model.score(X, y, sample_weight) def __getattr__(self, key): - if key in self.known_params: + if key in self._known_params: return getattr(self.model, key) else: raise AttributeError("No attribute " + key) def __setattr__(self, key, value): - if key in self.known_params: + if key in self._known_params: setattr(self.model, key, value) 
else: super().__setattr__(key, value) def get_params(self, deep=True): """Get parameters for this estimator.""" - return self.model.get_params(deep=deep) + return {k: v for k, v in self.model.get_params(deep=deep).items() if k in self._known_params} def set_params(self, **params): """Set parameters for this estimator.""" self.model.set_params(**params) +class WeightedLassoCVWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" + + _SingleEst = WeightedLassoCV + _MultiEst = WeightedMultiTaskLassoCV + + # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV + # (e.g. former has 'positive' and 'precompute' while latter does not) + _known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', + 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + + _post_fit_attrs = set(['alpha_', 'alphas_', 'coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + + +class WeightedLassoWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLasso or WeightedMultiTaskLasso depending on the shape of the target.""" + + _SingleEst = WeightedLasso + _MultiEst = WeightedMultiTaskLasso + _known_params = set(['alpha', 'fit_intercept', 'copy_X', 'max_iter', 'tol', + 'random_state', 'selection']) + _post_fit_attrs = set(['coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + + class SelectiveRegularization: """ Estimator of a linear model where regularization is applied to only a subset of the coefficients. 
diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index d8c55538d..b123fb5a2 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -3,27 +3,36 @@ """Collection of scikit-learn extensions for model selection techniques.""" import numbers -import pdb import warnings +import abc import numpy as np +from collections.abc import Iterable import scipy.sparse as sp import sklearn from joblib import Parallel, delayed from sklearn.base import BaseEstimator, clone, is_classifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.exceptions import FitFailedWarning -from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, KFold, +from sklearn.linear_model import (ElasticNet, ElasticNetCV, Lasso, LassoCV, MultiTaskElasticNet, MultiTaskElasticNetCV, + MultiTaskLasso, MultiTaskLassoCV, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, + LogisticRegression, LogisticRegressionCV) +from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, GroupKFold, KFold, RandomizedSearchCV, StratifiedKFold, check_cv) # TODO: conisder working around relying on sklearn implementation details from sklearn.model_selection._validation import (_check_is_permutation, _fit_and_predict) -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.utils import check_random_state, indexable from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples -from econml.sklearn_extensions.model_selection_utils import * +from .linear_model import WeightedLassoCVWrapper, WeightedLassoWrapper +from .model_selection_utils import (auto_hyperparameters, can_handle_multitask, get_complete_estimator_list, + has_random_state, is_data_scaled, is_likely_multi_task, + is_mlp, is_polynomial_pipeline, just_one_model_no_params, make_model_multi_task, 
+ make_param_multi_task, param_grid_is_empty, supports_sample_weight) def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False): @@ -261,11 +270,297 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits +class ModelSelector(metaclass=abc.ABCMeta): + """ + This class enables a two-stage fitting process, where first a model is selected + by calling `train` with `is_selecting=True`, and then the selected model is fit (presumably + on a different data set) by calling train with `is_selecting=False`. + + + """ + + @abc.abstractmethod + def train(self, is_selecting: bool, *args, **kwargs): + """ + Either selects a model or fits a model, depending on the value of `is_selecting`. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def predict(self, *args, **kwargs): + """ + Predicts using the selected model; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def score(self, *args, **kwargs): + """ + Gets the score of the selected model on the given data; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + +class SingleModelSelector(ModelSelector): + """ + A model selection class that selects a single best model; + this encompasses random search, grid search, ensembling, etc. 
+ """ + + @property + @abc.abstractmethod + def best_model(self): + raise NotImplementedError("Abstract method") + + @property + @abc.abstractmethod + def best_score(self): + raise NotImplementedError("Abstract method") + + def predict(self, *args, **kwargs): + return self.best_model.predict(*args, **kwargs) + + def predict_proba(self, *args, **kwargs): + return self.best_model.predict_proba(*args, **kwargs) + + def score(self, *args, **kwargs): + if hasattr(self.best_model, 'score'): + return self.best_model.score(*args, **kwargs) + else: + return None + + +def _fit_with_groups(model, X, y, *, groups, **kwargs): + """ + Fits a model while correctly handling grouping if necessary. + + This enables us to perform an inner-loop cross-validation of a model + which handles grouping correctly, which is not easy using typical sklearn models. + + For example, GridSearchCV and RandomizedSearchCV both support passing `groups` to fit, + but other CV-related estimators (e.g. LassoCV) do not, which means that GroupKFold + cannot be used as the cv instance, because the `groups` argument will never be passed through + to GroupKFold's `split` method. + + The hacky workaround here is to explicitly set the `cv` attribute to the set of + rows that GroupKFold would have generated rather than using GroupKFold as the cv instance.
+ """ + if groups is not None: + if hasattr(model, 'cv'): + old_cv = model.cv + # logic copied from check_cv + cv = 5 if old_cv is None else old_cv + if isinstance(cv, numbers.Integral): + cv = GroupKFold(cv) + # otherwise we will assume the user already set the cv attribute to something + # compatible with splitting with a `groups` argument + + splits = list(cv.split(X, y, groups=groups)) + try: + model.cv = splits + return model.fit(X, y, **kwargs) # drop groups from arg list + finally: + model.cv = old_cv + + # drop groups from arg list, which were already used at the outer level and may not be supported by the model + return model.fit(X, y, **kwargs) + + +class FixedModelSelector(SingleModelSelector): + """ + Model selection class that always selects the given model + """ + + def __init__(self, model): + self.model = clone(model, safe=False) + + def train(self, is_selecting, *args, groups=None, **kwargs): + # whether selecting or not, need to train the model on the data + # TODO: want to get out-of-sample score here if selecting, which + # would require cross-validation, but want to respect grouping, stratifying, etc. 
+ _fit_with_groups(self.model, *args, groups=groups, **kwargs) + if is_selecting and hasattr(self.model, 'score'): + self._score = self.model.score(*args, **kwargs) + return self + + @property + def best_model(self): + return self.model + + @property + def best_score(self): + return self._score + + +class SklearnCVSelector(SingleModelSelector): + """ + Wraps one of sklearn's CV classes in the ModelSelector interface + """ + + def __init__(self, searcher): + self.searcher = clone(searcher) + + @staticmethod + def convertible_types(): + return {GridSearchCV, RandomizedSearchCV} | SklearnCVSelector._model_mapping().keys() + + @staticmethod + def can_wrap(model): + return any(isinstance(model, model_type) for model_type in SklearnCVSelector.convertible_types()) + + @staticmethod + def _model_mapping(): + return {LogisticRegressionCV: (LogisticRegression, + ["C", "l1_ratio"], + [], + ["classes_", "coef_", "intercept_", "n_features_in_", "n_iter_"]), + ElasticNetCV: (ElasticNet, + ["alpha", "l1_ratio"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + LassoCV: (Lasso, + ["alpha"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + RidgeCV: (Ridge, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + RidgeClassifierCV: (RidgeClassifier, + ["alpha"], + [], + ["label_binarizer", "coef_", "intercept_", "n_features_in_", "n_iter_"]), + MultiTaskElasticNetCV: (MultiTaskElasticNet, + ["alpha", "l1_ratio"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + MultiTaskLassoCV: (MultiTaskLasso, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + WeightedLassoCVWrapper: (WeightedLassoWrapper, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]) + } + + def train(self, is_selecting: bool, *args, groups=None, **kwargs): + if is_selecting: + + 
_fit_with_groups(self.searcher, *args, groups=groups, **kwargs) + self._best_model = self._extract_best_model() + # TODO: ideally, want the out-of-sample score here instead; + # but this is not exposed in a consistent way + self._best_score = self.searcher.score(*args, **kwargs) + else: + # don't need to use _fit_with_groups here since none of these models support it + self.best_model.fit(*args, **kwargs) + return self + + @property + def best_model(self): + return self._best_model + + @property + def best_score(self): + return self._best_score + + def _extract_best_model(self): + if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, RandomizedSearchCV): + return self.searcher.best_estimator_ + else: + for known_type in self._model_mapping().keys(): + if isinstance(self.searcher, known_type): + model_type, opt_params, strip_params, fit_vars = self._model_mapping()[known_type] + model = model_type() + # set all shared parameters + for param in model.get_params().keys() & self.searcher.get_params().keys() - set(strip_params): + setattr(model, param, getattr(self.searcher, param)) + # update learned hyperparameters with best values + for param in opt_params: + setattr(model, param, getattr(self.searcher, param + "_")) + # set all fitted variables + for var in fit_vars: + setattr(model, var, getattr(self.searcher, var)) + return model + raise ValueError(f"Unsupported type: {type(self.searcher)}") + + +class ListSelector(SingleModelSelector): + """ + Model selection class that selects the best model from a list of model selectors + + Parameters + ---------- + models : list of ModelSelector + The list of model selectors to choose from + unwrap : bool, default True + Whether to return the best model's best model, rather than just the outer best model selector + """ + + def __init__(self, models, unwrap=True): + self.models = [clone(model, safe=False) for model in models] + self.unwrap = unwrap + + def train(self, is_selecting, *args, **kwargs): + if 
is_selecting: + scores = [] + for model in self.models: + model.train(is_selecting, *args, **kwargs) + scores.append(model.best_score) + self._all_scores = scores + self._best_score = np.max(scores) + self._best_model = self.models[np.argmax(scores)] + + else: + self._best_model.train(is_selecting, *args, **kwargs) + + @property + def best_model(self): + """ + Gets the best model; note that if we were selecting over SingleModelSelectors and `unwrap` is `False`, + we will return the SingleModelSelector instance, not its best model. + """ + return self._best_model.best_model if self.unwrap else self._best_model + + @property + def best_score(self): + return self._best_score + + +def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV): + named_models = { + 'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete + else WeightedLassoCVWrapper(random_state=random_state, cv=cv)), + 'forest': (RandomForestClassifier(random_state=random_state) if is_discrete + else RandomForestRegressor(random_state=random_state)), + } + if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything + return input + elif isinstance(input, list): # we've got a list; call get_selector on each element, then wrap in a ListSelector + models = [get_selector(model, is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + for model in input] + return ListSelector(models) + elif isinstance(input, str): # we've got a string; look it up + if input in named_models: + return get_selector(named_models[input], is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + else: + raise ValueError(f"Unknown model type: {input}, must be one of {named_models.keys()}") + elif SklearnCVSelector.can_wrap(input): + return SklearnCVSelector(input) + else: # assume this is an sklearn-compatible model + return FixedModelSelector(input) + + class SearchEstimatorList(BaseEstimator): """ The 
SearchEstimatorList is a utility class for hyperparameter tuning. - It provides a convenient way to perform GridSearch cross-validation for - a list of estimators. The class automates the process of hyperparameter + It provides a convenient way to perform GridSearch cross-validation for + a list of estimators. The class automates the process of hyperparameter tuning, model fitting, and prediction for multiple estimators. @@ -275,7 +570,8 @@ class SearchEstimatorList(BaseEstimator): A list of names of estimators to be used for grid search. param_grid_list : list or 'auto', default 'auto' - A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. If set to 'auto', the class automatically generates hyperparameters for the estimators. + A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. If set to 'auto', + the class automatically generates hyperparameters for the estimators. scaling : bool, default True Indicates whether to scale the input data using StandardScaler. @@ -304,32 +600,35 @@ class SearchEstimatorList(BaseEstimator): random_state : int, RandomState instance, or None, default None If int, `random_state` is the seed used by the random number generator; If `RandomState` instance, `random_state` is the random number generator; - If None, the random number generator is the `RandomState` instance used by `np.random`. Used when `shuffle` == True. + If None, the random number generator is the `RandomState` instance used by `np.random`. + Used when `shuffle` == True. error_score : float or 'raise', default np.nan - The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', an error is raised. + The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', + an error is raised. return_train_score : bool, default False Determines whether to include training scores in the `cv_results_` attribute of the class. 
categorical_indices : str, int, list, or None default None - List of categorical indices + List of categorical indices """ - def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, is_discrete=False, scoring=None, - n_jobs=None, refit=True, cv=2, verbose=2, pre_dispatch='2*n_jobs', random_state=None, + def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, + is_discrete=False, scoring=None, n_jobs=None, refit=True, cv=2, verbose=2, + pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False, categorical_indices=None): - # pdb.set_trace() self.estimator_list = estimator_list self.complete_estimator_list = get_complete_estimator_list( clone(estimator_list, safe=False), is_discrete=is_discrete, random_state=random_state) - # TODO Add in more functionality by checking if it's an empty list. If it's just 1 dictionary then we're going to need to turn it into a list + # TODO Add in more functionality by checking if it's an empty list. 
If it's just 1 dictionary + # then we're going to need to turn it into a list # Just do more cases if param_grid_list == 'auto': self.param_grid_list = auto_hyperparameters( estimator_list=self.complete_estimator_list, is_discrete=is_discrete) - elif (param_grid_list == None): + elif (param_grid_list is None): self.param_grid_list = len(self.complete_estimator_list) * [{}] else: if isinstance(param_grid_list, dict): @@ -338,7 +637,7 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc self.param_grid_list = param_grid_list self.categorical_indices = categorical_indices self.scoring = scoring - if scoring == None: + if scoring is None: if is_discrete: self.scoring = 'f1_macro' else: @@ -357,10 +656,6 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly'] def fit(self, X, y, *, sample_weight=None, groups=None): - # print(groups) - # if groups != None: - # pdb.set_trace() - # pdb.set_trace() self._search_list = [] # Change estimators if multi_task @@ -369,7 +664,7 @@ def fit(self, X, y, *, sample_weight=None, groups=None): if not can_handle_multitask(model=estimator, is_discrete=self.is_discrete): self.complete_estimator_list[index] = make_model_multi_task( model=estimator, is_discrete=self.is_discrete) - if self.param_grid_list != None: + if self.param_grid_list is not None: self.param_grid_list[index] = make_param_multi_task( estimator=estimator, param_grid=self.param_grid_list[index]) @@ -381,9 +676,10 @@ def fit(self, X, y, *, sample_weight=None, groups=None): if just_one_model_no_params(estimator_list=self.complete_estimator_list, param_list=self.param_grid_list): # Just fit the model and return it, no need for grid search or for loop estimator = self.complete_estimator_list[0] - if self.random_state != None: + if self.random_state is not None: if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the 
random state of the linear part, the polynomial part doesn't have random state + # For a polynomial pipeline, you have to set the random state of the linear part, + # the polynomial part doesn't have random state if is_polynomial_pipeline(estimator): estimator = estimator.set_params(linear__random_state=self.random_state) else: @@ -407,14 +703,15 @@ def fit(self, X, y, *, sample_weight=None, groups=None): else: print(f"Processing estimator: {type(estimator).__name__}") try: - if self.random_state != None: + if self.random_state is not None: if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the random state of the linear part, the polynomial part doesn't have random state + # For a polynomial pipeline, you have to set the random state of the linear part, + # the polynomial part doesn't have random state if is_polynomial_pipeline(estimator): estimator = estimator.set_params(linear__random_state=self.random_state) else: estimator.set_params(random_state=self.random_state) - # pdb.set_trace() # Note Delete this + temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring, n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, pre_dispatch=self.pre_dispatch, error_score=self.error_score, @@ -442,8 +739,10 @@ def fit(self, X, y, *, sample_weight=None, groups=None): warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}" warnings.warn(warning_msg, category=UserWarning) if not hasattr(temp_search, 'cv_results_') and not param_grid_is_empty(param_grid=param_grid): - # This warning catches a problem after fit has run with no exception, however if there is no cv_results_ this indicates a failed fit operation. - warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed has no attribute cv_results_." + # This warning catches a problem after fit has run with no exception, + # however if there is no cv_results_ this indicates a failed fit operation. 
+ warning_msg = (f"Warning: estimator {estimator} and param_grid {param_grid} " + "failed, has no attribute cv_results_.") warnings.warn(warning_msg, category=FitFailedWarning) try: self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) @@ -453,8 +752,8 @@ def fit(self, X, y, *, sample_weight=None, groups=None): self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_ self.best_score_ = self._search_list[self.best_ind_].best_score_ self.best_params_ = self._search_list[self.best_ind_].best_params_ - print( - f'Best estimator {self.best_estimator_} and best score {self.best_score_} and best params {self.best_params_}') + print(f'Best estimator {self.best_estimator_} and best score {self.best_score_} ' + f'and best params {self.best_params_}') return self def scaler_transform(self, X): @@ -496,20 +795,14 @@ class GridSearchCVList(BaseEstimator): of parameter settings. """ - def __init__(self, estimator_list=['linear', 'forest'], param_grid_list='auto', scoring=None, + def __init__(self, estimator_list, param_grid_list, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=False, is_discrete=False): - # 'discrete' if is_discrete else 'continuous' - self.estimator_list = get_complete_estimator_list(estimator_list, is_discrete, ) - if param_grid_list == 'auto': - self.param_grid_list = auto_hyperparameters(estimator_list=self.estimator_list, is_discrete=is_discrete) - elif (param_grid_list == None): - self.param_grid_list = len(self.estimator_list) * [{}] - else: - self.param_grid_list = param_grid_list + error_score=np.nan, return_train_score=False): + self.estimator_list = estimator_list + self.param_grid_list = param_grid_list self.scoring = scoring self.n_jobs = n_jobs - # self.refit = refit + self.refit = refit self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch @@ -519,7 +812,7 @@ def __init__(self, estimator_list=['linear', 'forest'], 
param_grid_list='auto', def fit(self, X, y=None, **fit_params): self._gcv_list = [GridSearchCV(estimator, param_grid, scoring=self.scoring, - n_jobs=self.n_jobs, cv=self.cv, verbose=self.verbose, + n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, pre_dispatch=self.pre_dispatch, error_score=self.error_score, return_train_score=self.return_train_score) for estimator, param_grid in zip(self.estimator_list, self.param_grid_list)] @@ -529,9 +822,6 @@ def fit(self, X, y=None, **fit_params): self.best_params_ = self._gcv_list[self.best_ind_].best_params_ return self - def best_model(self): - return self.best_estimator_ - def predict(self, X): return self.best_estimator_.predict(X) @@ -539,7 +829,7 @@ def predict_proba(self, X): return self.best_estimator_.predict_proba(X) -def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=3, +def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict', safe=True): """This is a fork from :meth:`~sklearn.model_selection.cross_val_predict` to allow for diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py index 477731600..ab3f567d8 100644 --- a/econml/sklearn_extensions/model_selection_utils.py +++ b/econml/sklearn_extensions/model_selection_utils.py @@ -1,5 +1,4 @@ -import pdb import warnings from sklearn.exceptions import NotFittedError import numpy as np @@ -104,7 +103,8 @@ def select_estimator(estimator_type, is_discrete, random_state): Parameters ---------- - estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'automl', 'all'. + estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', + 'gbf', 'nnet', 'poly', 'automl', 'all'. is_discrete (bool): The type of target variable, if true then it's discrete. 
TODO Add Random State for parameter Returns @@ -156,7 +156,8 @@ def check_list_type(lst): bool: True if the list only contains valid objects, False otherwise. Raises: - TypeError: If the list contains objects other than strings, sklearn model objects, or sklearn model selection objects. + TypeError: If the list contains objects other than strings, sklearn model objects, + or sklearn model selection objects. Examples: >>> check_list_type(['linear', RandomForestRegressor(), KFold()]) @@ -167,13 +168,12 @@ def check_list_type(lst): if len(lst) == 0: raise ValueError("Estimator list is empty. Please add some models or use some of the defaults provided.") - # pdb.set_trace() for element in lst: if (not isinstance(element, (str, BaseCrossValidator))): if not is_likely_estimator(element): - # pdb.set_trace() raise TypeError( - f"The list must contain only strings, sklearn model objects, and sklearn model selection objects. Invalid element: {element}") + "The list must contain only strings, sklearn model objects, and sklearn model selection objects. " + f"Invalid element: {element}") return True @@ -183,7 +183,8 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): Parameters ---------- - estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'auto', 'all'. + estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', + 'nnet', 'poly', 'auto', 'all'. is_discrete (bool): if target type is discrete or continuous. Returns @@ -194,7 +195,6 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): ValueError: If the estimator is not supported. 
''' - # pdb.set_trace() if isinstance(estimator_list, str): if 'all' == estimator_list: estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'] @@ -204,7 +204,8 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): estimator_list = [estimator_list] else: raise ValueError( - "Invalid estimator_list value. Please provide a valid value from the list of available estimators: ['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") + "Invalid estimator_list value. Please provide a valid value from the list of available estimators: " + "['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") elif isinstance(estimator_list, list): if 'auto' in estimator_list: for estimator in ['linear']: @@ -236,11 +237,10 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): temp_est_list = flatten_list(temp_est_list) # Check that all types of models are matched towards the problem. - # pdb.set_trace() for estimator in temp_est_list: if (isinstance(estimator, BaseEstimator)): if not is_regressor_or_classifier(estimator, is_discrete=is_discrete): - raise TypeError("Invalid estimator type: {} - must be a regressor or classifier".format(type(estimator))) + raise TypeError(f"Invalid estimator type: {type(estimator)} - must be a regressor or classifier") return temp_est_list @@ -292,7 +292,9 @@ def select_classification_hyperparameters(estimator): 'linear__solver': ['saga', 'lbfgs'] } else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", category=UserWarning) + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " + "LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", + category=UserWarning) return {} # raise ValueError("Invalid model type. 
Valid values are 'linear', 'forest', 'nnet', and 'poly'.") @@ -340,7 +342,9 @@ def select_regression_hyperparameters(estimator): 'poly__degree': [2, 3, 4] } else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", category=UserWarning) + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " + "ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", + category=UserWarning) return {} @@ -490,7 +494,8 @@ def is_linear_model(estimator): """ Check if a model is a linear model. - This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of LogisticRegression, LinearSVC, or SVC. + This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of + LogisticRegression, LinearSVC, or SVC. Parameters ---------- @@ -521,7 +526,8 @@ def is_data_scaled(X): """ Check if input data is scaled. - This function checks if the input data is scaled by comparing its mean and standard deviation to 0 and 1 respectively. + This function checks if the input data is scaled by comparing its mean and standard deviation to + 0 and 1 respectively. Parameters ---------- @@ -754,7 +760,8 @@ def make_param_multi_task(estimator, param_grid): """ Convert the keys in a parameter grid to work with a multi-task model. - This function converts the keys in a parameter grid to work with a multi-task model by prepending 'estimator__' to each key. + This function converts the keys in a parameter grid to work with a multi-task model by prepending + 'estimator__' to each key. 
Parameters ---------- diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index afb445ccd..2f321c5f2 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -22,9 +22,7 @@ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.multioutput import MultiOutputRegressor from econml.grf import MultiOutputGRF -from econml.sklearn_extensions.model_selection import SearchEstimatorList from econml.tests.utilities import (GroupingModel, NestedModel) -import pdb try: import ray @@ -625,9 +623,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, Pipeline) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, WeightedLasso) for mdl in est.models_t[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, LogisticRegression) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A', 'A^2']) np.testing.assert_array_equal(est.cate_feature_names(), ['X0', 'X0^2']) est = DML(model_y=WeightedLasso(), @@ -641,9 +639,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, FunctionTransformer) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, WeightedLasso) for mdl in est.models_t[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, LogisticRegression) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A']) def test_forest_dml_perf(self): @@ -1131,7 +1129,6 @@ def _test_sparse(n_p, d_w, n_r): model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) dml.fit(y, t, X=x, W=w) - # pdb.set_trace() np.testing.assert_allclose(a, dml.coef_.reshape(-1), atol=1e-1) eff = reshape(t * np.choose(np.tile(p, 2), a), (-1,)) np.testing.assert_allclose(eff, dml.effect(x, T0=0, T1=t), atol=1e-1) @@ -1239,8 +1236,8 @@ def 
test_groups(self): # test outer grouping # with 2 folds, we should get exactly 3 groups per split, each with 10 copies of the y or t value - est = LinearDML(model_y=GroupingModel(LinearRegression(), (3, 3), n_copies), - model_t=GroupingModel(LinearRegression(), (3, 3), n_copies)) + est = LinearDML(model_y=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), + model_t=GroupingModel(LinearRegression(), 60, (3, 3), n_copies)) est.fit(y, t, groups=groups) # test nested grouping @@ -1248,17 +1245,10 @@ def test_groups(self): # with 2-fold outer and 2-fold inner grouping, and six total groups, # should get 1 or 2 groups per split - est = LinearDML(model_y=NestedModel(LassoCV(cv=2), (1, 2), n_copies), - model_t=NestedModel(LassoCV(cv=2), (1, 2), n_copies)) + est = LinearDML(model_y=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), + model_t=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies)) est.fit(y, t, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does this imply we should change some defaults to make this more likely to succeed? 
- est = LinearDML(model_y=LassoCV(cv=5), model_t=LassoCV(cv=5)) - with pytest.raises(Exception): - est.fit(y, t, groups=groups) - def test_treatment_names(self): Y = np.random.normal(size=(100, 1)) T = np.random.binomial(n=1, p=0.5, size=(100, 1)) diff --git a/econml/tests/test_dmliv.py b/econml/tests/test_dmliv.py index f52c14356..16f8f55a9 100644 --- a/econml/tests/test_dmliv.py +++ b/econml/tests/test_dmliv.py @@ -207,7 +207,7 @@ def test_groups(self): projection=False, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_z_xw=LogisticRegression(), ), @@ -215,7 +215,7 @@ def test_groups(self): projection=True, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -223,7 +223,7 @@ def test_groups(self): model_final=LinearRegression(fit_intercept=False), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -231,7 +231,7 @@ def test_groups(self): model_final=RandomForestRegressor(), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 39b90c1ed..38bb8421a 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -13,7 +13,7 @@ import pickle from scipy import special from sklearn.preprocessing import 
PolynomialFeatures -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression import unittest try: @@ -281,7 +281,10 @@ def test_accuracy_without_ray(self): def test_fit_cov_directly(self): # fitting the covariance directly should be at least as good as computing the covariance from separate models - est = LinearDRIV() + + # set the models so that model selection over random forests doesn't take too much time in the repeated trials + est = LinearDRIV(model_y_xw=LassoCV(), model_t_xw=LassoCV(), model_z_xw=LassoCV(), + model_tz_xw=LassoCV()) n = 500 p = 10 @@ -334,8 +337,8 @@ def ceil(a, b): # ceiling analog of // DRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -344,8 +347,8 @@ def ceil(a, b): # ceiling analog of // LinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -354,8 +357,8 @@ def ceil(a, b): # ceiling analog of // SparseLinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -364,20 +367,20 @@ def ceil(a, b): # ceiling analog 
of // ForestDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), IntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), LinearIntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ) diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index f6a5e4ae8..3d3e982a9 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -828,26 +828,17 @@ def test_groups(self): # cross-fit generate one est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold grouping, we should get exactly 3 groups per split - model_regression=GroupingModel(LinearRegression(), (3, 3), n_copies), + model_regression=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) # test nested grouping est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold outer and 2-fold inner grouping, we should get 1-2 groups per split - model_regression=NestedModel(LassoCV(cv=2), (1, 2), n_copies), + model_regression=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does 
this imply we should change some defaults to make this more likely to succeed? - est = LinearDRLearner(model_propensity=LogisticRegressionCV(cv=5), - model_regression=LassoCV(cv=5), - cv=StratifiedGroupKFold(2)) - with pytest.raises(Exception): - est.fit(y, t, W=w, groups=groups) - def test_score(self): """Test that scores are the same no matter whether the prediction of cate model has the same shape of input or the shape of input.reshape(-1,1).""" diff --git a/econml/tests/test_missing_values.py b/econml/tests/test_missing_values.py index 66d917f76..eb1c4f7e4 100644 --- a/econml/tests/test_missing_values.py +++ b/econml/tests/test_missing_values.py @@ -27,7 +27,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 66c389ae0..142f23563 100644 --- a/econml/tests/test_ortho_learner.py +++ b/econml/tests/test_ortho_learner.py @@ -29,7 +29,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, Q, W=None): + def train(self, is_selecting, X, y, Q, W=None): self._model.fit(X, y) return self @@ -109,7 +109,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self @@ -179,7 +179,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, Q, W=None): + def train(self, is_selecting, X, y, Q, W=None): self._model.fit(X, y) return self @@ -219,7 +219,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -331,7 +331,7 @@ def __init__(self, model_t, model_y): 
self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -378,7 +378,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -434,7 +434,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1] + 1))) self._model_y.fit(W, Y) return self diff --git a/econml/tests/test_refit.py b/econml/tests/test_refit.py index 9cf93334c..00cc81dff 100644 --- a/econml/tests/test_refit.py +++ b/econml/tests/test_refit.py @@ -188,9 +188,9 @@ def test_orthoiv(self): est.model_t_xw = ElasticNet() est.model_z_xw = WeightedLasso() est.fit(y, T, Z=Z, W=W, cache_values=True) - assert isinstance(est.models_nuisance_[0][0]._model_y_xw._model, Lasso) - assert isinstance(est.models_nuisance_[0][0]._model_t_xw._model, ElasticNet) - assert isinstance(est.models_nuisance_[0][0]._model_z_xw._model, WeightedLasso) + assert isinstance(est.models_y_xw[0][0], Lasso) + assert isinstance(est.models_t_xw[0][0], ElasticNet) + assert isinstance(est.models_z_xw[0][0], WeightedLasso) est = DMLIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), @@ -202,9 +202,9 @@ def test_orthoiv(self): est.model_t_xw = ElasticNet() est.model_t_xwz = WeightedLasso() est.fit(y, T, Z=Z, X=X, W=W, cache_values=True) - assert isinstance(est.models_nuisance_[0][0]._model_y_xw._model, Lasso) - assert isinstance(est.models_nuisance_[0][0]._model_t_xw._model, ElasticNet) - assert isinstance(est.models_nuisance_[0][0]._model_t_xwz._model, WeightedLasso) + assert isinstance(est.models_y_xw[0][0], Lasso) + assert isinstance(est.models_t_xw[0][0], 
ElasticNet) + assert isinstance(est.models_t_xwz[0][0], WeightedLasso) est = NonParamDMLIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), diff --git a/econml/tests/utilities.py b/econml/tests/utilities.py index 4c04cc89d..1c11be343 100644 --- a/econml/tests/utilities.py +++ b/econml/tests/utilities.py @@ -16,15 +16,17 @@ class GroupingModel: and the number of copies of each y value should be equal to the group size """ - def __init__(self, model, limits, n_copies): + def __init__(self, model, total, limits, n_copies): self.model = model + self.total = total self.limits = limits self.n_copies = n_copies - def validate(self, y): + def validate(self, y, skip_group_counts=False): (yvals, cts) = np.unique(y, return_counts=True) (llim, ulim) = self.limits - if not (llim <= len(yvals) <= ulim): + # if we aren't fitting on the whole dataset, ensure that the limits are respected + if (not skip_group_counts) and (not (llim <= len(yvals) <= ulim)): raise Exception(f"Grouping failed: received {len(yvals)} groups instead of {llim}-{ulim}") # ensure that the grouping has worked correctly and we get exactly the number of copies @@ -35,7 +37,7 @@ def validate(self, y): f"Grouping failed; received {ct} copies of {yval} instead of {self.n_copies[yval]}") def fit(self, X, y): - self.validate(y) + self.validate(y, len(y) == self.total) self.model.fit(X, y) return self @@ -46,12 +48,9 @@ def predict(self, X): class NestedModel(GroupingModel): """ Class for testing nested grouping. 
The wrapped model must have a 'cv' attribute; - this class exposes an identical 'cv' attribute, which is how nested CV is implemented in fit_with_groups + this class exposes an identical 'cv' attribute, which is how nested CV is implemented in _fit_with_groups """ - def __init__(self, model, limits, n_copies): - super().__init__(model, limits, n_copies) - # DML nested CV works via a 'cv' attribute @property def cv(self): @@ -64,6 +63,6 @@ def cv(self, value): def fit(self, X, y): for (train, test) in check_cv(self.cv, y).split(X, y): # want to validate the nested grouping, not the outer grouping in the nesting tests - self.validate(y[train]) + self.validate(y[train], len(y) == self.total) self.model.fit(X, y) return self diff --git a/econml/utilities.py b/econml/utilities.py index 008bfc244..f62ffbb4d 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -21,7 +21,6 @@ from sklearn.preprocessing import PolynomialFeatures import warnings from warnings import warn -from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold from collections.abc import Iterable from sklearn.utils.multiclass import type_of_target import numbers @@ -30,7 +29,6 @@ from statsmodels.compat.python import lmap import copy from inspect import signature -from econml.sklearn_extensions.model_selection import SearchEstimatorList MAX_RAND_SEED = np.iinfo(np.int32).max @@ -920,78 +918,6 @@ def filter_inds(coords, data, n): [arrs[indMap[c][0][0]].shape[indMap[c][0][1]] for c in outputs]) -def fit_with_groups(model, X, y, groups=None, **kwargs): - """ - Fit a model while correctly handling grouping if necessary. - - This enables us to perform an inner-loop cross-validation of a model - which handles grouping correctly, which is not easy using typical sklearn models. 
- - For example, GridSearchCV and RandomSearchCV both support passing 'groups' to fit, - but other CV-related estimators (such as those derived from LinearModelCV, including LassoCV), - do not support passing groups to fit which meanst that GroupKFold cannot be used as the cv instance - when using these types, because the required 'groups' argument will never be passed to the - GroupKFold's split method. See also https://github.com/scikit-learn/scikit-learn/issues/12052 - - The (hacky) workaround that is used here is to explicitly set the 'cv' attribute (if there is one) to - the exact set of rows and not to use GroupKFold even with the sklearn classes that could support it; - this should work with classes derived from BaseSearchCV, LinearModelCV, and CalibratedClassifierCV. - - Parameters - ---------- - model : estimator - The model to fit - X : array_like - The features to fit against - y : array_like - The target to fit against - groups : array_like, optional - The set of groupings that should be kept together when splitting rows for - cross-validation - kwargs : dict - Any other named arguments to pass to the model's fit - """ - # import pdb - # pdb.set_trace() - if groups is not None: - if isinstance(model, SearchEstimatorList): - # SearchEstimatorList must be handled different. Each estimator must be changed for CV else the functionality isn't the same - # It does have a CV but it does not work if you just change the CV of the SearchEstimatorList - for estimator in model.complete_estimator_list: - if hasattr(estimator, 'cv'): - old_cv = estimator.cv - cv = 5 if old_cv is None else old_cv - if isinstance(cv, numbers.Integral): - cv = GroupKFold(cv) - splits = list(cv.split(X, y, groups=groups)) - try: - estimator.cv = splits - except: - estimator.cv = old_cv - # assume that we should perform nested cross-validation if and only if - # the model has a 'cv' attribute; this is a somewhat brittle assumption... 
- elif hasattr(model, 'cv'): - old_cv = model.cv - # logic copied from check_cv - cv = 5 if old_cv is None else old_cv - if isinstance(cv, numbers.Integral): - cv = GroupKFold(cv) - # otherwise we will assume the user already set the cv attribute to something - # compatible with splitting with a 'groups' argument - - # now we have to compute the folds explicitly because some classifiers (like LassoCV) - # don't use the groups when calling split internally - splits = list(cv.split(X, y, groups=groups)) - try: - print(splits) - model.cv = splits - return model.fit(X, y, **kwargs) - finally: - model.cv = old_cv - - return model.fit(X, y, **kwargs) - - def filter_none_kwargs(**kwargs): """ Filters out any keyword arguments that are None. diff --git a/notebooks/SearchEstimatorList functionality.ipynb b/notebooks/SearchEstimatorList functionality.ipynb deleted file mode 100644 index 4464199de..000000000 --- a/notebooks/SearchEstimatorList functionality.ipynb +++ /dev/null @@ -1,1031 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Import necessary packages\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import mean_squared_error, accuracy_score\n", - "from sklearn.datasets import load_iris\n", - "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", - "import warnings\n", - "import numpy as np\n", - "from econml.dml import LinearDML, CausalForestDML\n", - "from econml.cate_interpreter import SingleTreeCateInterpreter, SingleTreePolicyInterpreter\n", - "import pandas as pd\n", - "from sklearn.preprocessing import PolynomialFeatures\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.exceptions import ConvergenceWarning\n", - "\n", - "# Ignore the ConvergenceWarning\n", - "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "# SearchEstimatorList\n", - "\n", - "The SearchEstimatorList class is a custom Python class designed to streamline the process of training multiple machine learning models and tuning their hyperparameters. This class can be especially useful when you're unsure which model will perform best on your data and you want to compare several of them.\n", - "\n", - "# Key Features\n", - "\n", - " Multiple Model Training: The SearchEstimatorList class takes a list of Scikit-learn estimators (machine learning models) and trains each of them on your data.\n", - "\n", - " Hyperparameter Tuning: For each model, the class conducts a grid search over a provided range of hyperparameters. This allows you to automatically find the hyperparameters that result in the best model performance.\n", - "\n", - " Model Evaluation: The class retains the best performing model based on a specified scoring metric. This makes it easy to determine which model and hyperparameters are the most suitable for your data.\n", - "\n", - " Data Scaling: The SearchEstimatorList class also supports data scaling, which can be important for certain types of models.\n", - "\n", - " Handling of Different Target Types: This class handles both continuous and discrete target variables, making it suitable for both regression and classification tasks.\n", - "\n", - "# Usage\n", - "\n", - "To use the SearchEstimatorList class, you start by initializing an instance of the class with a list of models and their corresponding hyperparameter grids. Then, you call the fit method to train the models and conduct the grid search. After fitting, you can use the predict method to generate predictions for new data. The class also has methods to refit the best model using the entire dataset (refit) and to return the best model (best_model)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method f1_macro.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", - "[CV 1/2] END ...................n_estimators=50;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ...................n_estimators=50;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END ..................n_estimators=100;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ..................n_estimators=100;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END ..................n_estimators=150;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ..................n_estimators=150;, score=0.950 total time= 0.1s\n", - "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=50;, score=0.950 total time= 0.0s\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=100;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=100;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=150;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=150;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.1, n_estimators=50;, score=0.950 total time= 0.0s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=100;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END learning_rate=0.1, n_estimators=100;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=150;, score=0.900 total time= 0.1s\n", - 
"[CV 2/2] END learning_rate=0.1, n_estimators=150;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END ..learning_rate=1, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END ..learning_rate=1, n_estimators=50;, score=0.933 total time= 0.0s\n", - "[CV 1/2] END .learning_rate=1, n_estimators=100;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END .learning_rate=1, n_estimators=100;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END .learning_rate=1, n_estimators=150;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END .learning_rate=1, n_estimators=150;, score=0.933 total time= 0.1s\n", - "Best estimator RandomForestClassifier(n_estimators=50) and best score 0.9330819977445048 and best params {'n_estimators': 50}\n", - "Accuracy: 1.0\n" - ] - } - ], - "source": [ - "# Load the Iris dataset for classification\n", - "iris = load_iris()\n", - "\n", - "# Split the dataset into training and test sets\n", - "X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(\n", - " iris.data, iris.target, test_size=0.2, random_state=42\n", - ")\n", - "\n", - "# Define models and their parameter grids\n", - "estimator_list_cls = ['forest', 'gbf']\n", - "param_grid_list_cls = [{'n_estimators': [50, 100, 150]}, {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1]}]\n", - "\n", - "# Initialize SearchEstimatorList\n", - "sel_cls = SearchEstimatorList(\n", - " estimator_list=estimator_list_cls, \n", - " param_grid_list=param_grid_list_cls, \n", - " is_discrete=True,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_cls.fit(X_train_cls, y_train_cls)\n", - "\n", - "# Predict outcomes for the test set\n", - "predictions_cls = sel_cls.predict(X_test_cls)\n", - "\n", - "# Evaluate the model\n", - "acc = accuracy_score(y_test_cls, predictions_cls)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Accuracy: {acc}\")\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Regressor" 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 7 candidates, totalling 14 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method neg_mean_squared_error.\n", - " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV 1/2] END .....................l1_ratio=0.1;, score=-0.584 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.1;, score=-0.725 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.5;, score=-0.549 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.5;, score=-0.675 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.7;, score=-0.546 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.7;, score=-0.668 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.9;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.9;, score=-0.663 total time= 0.0s\n", - "[CV 1/2] END ....................l1_ratio=0.95;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END ....................l1_ratio=0.95;, score=-0.662 total time= 0.0s\n", - "[CV 1/2] END ....................l1_ratio=0.99;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END ....................l1_ratio=0.99;, score=-0.661 total time= 0.0s\n", - "[CV 1/2] END .......................l1_ratio=1;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END .......................l1_ratio=1;, score=-0.661 total time= 0.0s\n", - "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", - "[CV 1/2] END ............hidden_layer_sizes=50;, score=-0.712 
total time= 1.0s\n", - "[CV 2/2] END ............hidden_layer_sizes=50;, score=-0.580 total time= 1.3s\n", - "[CV 1/2] END ...........hidden_layer_sizes=100;, score=-0.695 total time= 0.8s\n", - "[CV 2/2] END ...........hidden_layer_sizes=100;, score=-2.334 total time= 1.0s\n", - "[CV 1/2] END ...........hidden_layer_sizes=200;, score=-0.641 total time= 8.1s\n", - "[CV 2/2] END ...........hidden_layer_sizes=200;, score=-1.162 total time= 5.4s\n", - "Best estimator ElasticNetCV(l1_ratio=1) and best score -0.6025662427788023 and best params {'l1_ratio': 1}\n", - "Mean Squared Error: 0.5555752649052167\n" - ] - } - ], - "source": [ - "# Import necessary packages\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import mean_squared_error, accuracy_score\n", - "from sklearn.datasets import fetch_california_housing\n", - "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", - "\n", - "# Load the Boston Housing dataset for regression\n", - "california_housing = fetch_california_housing()\n", - "\n", - "# Split the dataset into training and test sets\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n", - " california_housing.data, california_housing.target, test_size=0.2, random_state=42\n", - ")\n", - "\n", - "# Define models and their parameter grids\n", - "# This will use ElasticNet because it's a Linear Model and a Neural Network Regressor\n", - "estimator_list_reg = ['linear', 'nnet']\n", - "param_grid_list_reg = [{'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]}, {'hidden_layer_sizes': [50, 100, 200]}]\n", - "\n", - "# Initialize SearchEstimatorList\n", - "sel_reg = SearchEstimatorList(\n", - " estimator_list=estimator_list_reg, \n", - " param_grid_list=param_grid_list_reg,\n", - " is_discrete=False,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - "# Predict outcomes for the test set\n", - 
"predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - "# Evaluate the model\n", - "mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Mean Squared Error: {mse}\")\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using all estimators" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method f1_macro.\n", - " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" - ] - } - ], - "source": [ - "search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single Estimators and Model Objects" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag') and best score 0.966624895572264 and best params {'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", - "LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag')\n", - "{'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", - "mse of test dataset: 0.0\n", - "[[7.30818687e-04 9.18278306e-01 8.09908750e-02]\n", - " [9.96517769e-01 3.48223146e-03 9.52705844e-13]\n", - " [8.11833119e-11 2.27064968e-04 9.99772935e-01]\n", - " [1.49082115e-03 8.82474441e-01 1.16034738e-01]\n", - " [6.61814371e-04 9.57060549e-01 4.22776371e-02]\n", - " [9.94291457e-01 5.70854348e-03 8.51181731e-12]\n", - " [3.09570872e-02 9.66175329e-01 2.86758338e-03]\n", - " [1.03620286e-04 
2.72711857e-01 7.27184523e-01]\n", - " [1.86273814e-04 5.89659675e-01 4.10154051e-01]\n", - " [7.89829063e-03 9.84383361e-01 7.71834853e-03]\n", - " [1.79967697e-04 3.80342060e-01 6.19477972e-01]\n", - " [9.87625715e-01 1.23742845e-02 6.37903013e-11]\n", - " [9.97989545e-01 2.01045508e-03 2.71212460e-13]\n", - " [9.87073806e-01 1.29261936e-02 5.68033322e-11]\n", - " [9.97732149e-01 2.26785067e-03 1.43489213e-12]\n", - " [2.40047637e-03 9.42313621e-01 5.52859030e-02]\n", - " [1.40979957e-07 5.60447914e-03 9.94395380e-01]\n", - " [4.57991768e-03 9.78714479e-01 1.67056034e-02]\n", - " [1.07687184e-03 8.47974601e-01 1.50948527e-01]\n", - " [1.55738075e-07 5.44482660e-03 9.94555018e-01]\n", - " [9.84143440e-01 1.58565593e-02 2.21243624e-10]\n", - " [1.96353775e-04 3.77725182e-01 6.22078464e-01]\n", - " [9.90664487e-01 9.33551321e-03 6.98033897e-11]\n", - " [2.52736850e-07 8.46501225e-03 9.91534735e-01]\n", - " [1.95677109e-05 4.08891407e-01 5.91089025e-01]\n", - " [1.72461836e-05 8.83781623e-02 9.11604592e-01]\n", - " [1.09118029e-07 1.18285926e-02 9.88171298e-01]\n", - " [3.31801168e-07 1.03342423e-02 9.89665426e-01]\n", - " [9.86532115e-01 1.34678849e-02 1.68835118e-10]\n", - " [9.80493031e-01 1.95069688e-02 2.80655184e-10]]\n" - ] - } - ], - "source": [ - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - "\n", - " from sklearn.linear_model import LogisticRegression\n", - " lr_param_grid = {\n", - " 'penalty': ['l1', 'l2', 'elasticnet', 'none'],\n", - " 'C': [0.001, 0.01, 0.1, 1, 10, 100],\n", - " 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],\n", - " 'max_iter': [50, 100, 200, 500],\n", - " }\n", - "\n", - " search = SearchEstimatorList(estimator_list = LogisticRegression(), param_grid_list= lr_param_grid, verbose=0, is_discrete=True)\n", - " search.fit(X_train_cls, y_train_cls)\n", - " print(search.best_model())\n", - " print(search.best_params_)\n", - " y_pred = search.predict(X_test_cls)\n", - "\n", - " mse = 
mean_squared_error(y_test_cls, y_pred)\n", - "\n", - "print(\"mse of test dataset:\", mse,)\n", - "print(search.predict_proba(X_test_cls))\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Polynomial Feature\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.322 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.014 total time= 0.3s\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=4;, score=0.000 total time= 1.0s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=4;, score=-0.000 total time= 1.1s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.322 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.014 total time= 0.4s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=4;, score=0.000 total time= 1.5s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=4;, score=-0.000 total time= 1.3s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.322 total time= 0.2s\n", - "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.014 total time= 0.4s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=4;, score=0.000 total time= 1.1s\n", - "[CV 2/2] END 
linear__l1_ratio=0.9, poly__degree=4;, score=-0.000 total time= 1.1s\n", - "Best estimator Pipeline(steps=[('poly', PolynomialFeatures()),\n", - " ('linear', ElasticNetCV(l1_ratio=0.9))]) and best score 0.30443941337924607 and best params {'linear__l1_ratio': 0.9, 'poly__degree': 2}\n", - "Mean Squared Error: 0.8894038237145269\n" - ] - } - ], - "source": [ - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - " # For polynomial, please ensure that you have \"poly__\" (two \"_\" or underscores after poly) underneath to change degree\n", - " # To change the linear method please add \"linear__\" (two \"_\" or underscores after linear)\n", - " param_grid_list_poly = {'poly__degree': [2, 3, 4], 'linear__l1_ratio': [0.1, 0.5, 0.9]}\n", - " sel_reg = SearchEstimatorList(\n", - " estimator_list='poly', \n", - " param_grid_list=param_grid_list_poly,\n", - " is_discrete=False,\n", - " scoring='explained_variance',\n", - " verbose=3\n", - " )\n", - "\n", - " # Fit the model to the training data\n", - " sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - " # Predict outcomes for the test set\n", - " predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - " # Evaluate the model\n", - " mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - " # Print the evaluation metric\n", - " print(f\"Mean Squared Error: {mse}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['linear', 'forest', 'gbf', 'nnet', 'poly']\n" - ] - } - ], - "source": [ - "# These are all of the supported models that we have that have built in hyper parameters already included\n", - "print(sel_reg.supported_models)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END 
.................................., score=-0.518 total time= 0.1s\n", - "[CV 2/2] END .................................., score=-0.552 total time= 0.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.287 total time= 1.3s\n", - "[CV 2/2] END .................................., score=-0.293 total time= 1.3s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.286 total time= 3.1s\n", - "[CV 2/2] END .................................., score=-0.274 total time= 3.1s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.305 total time= 3.2s\n", - "[CV 2/2] END .................................., score=-0.305 total time= 3.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.526 total time= 0.6s\n", - "[CV 2/2] END ................................., score=-12.077 total time= 0.5s\n", - "Best estimator RandomForestRegressor() and best score -0.27976201134927425 and best params {}\n", - "Mean Squared Error: 0.2508316133481009\n" - ] - } - ], - "source": [ - "# To try every type of model simply use the \"all\" option\n", - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - " sel_reg = SearchEstimatorList(\n", - " estimator_list='all', \n", - " param_grid_list=None,\n", - " is_discrete=False,\n", - " scaling=True,\n", - " verbose=5\n", - " )\n", - "\n", - " # Fit the model to the training data\n", - " sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - " # Predict outcomes for the test set\n", - " predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - " # Evaluate the model\n", - " mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - " # Print the evaluation metric\n", - " print(f\"Mean Squared Error: {mse}\")" - 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scoring functions\n", - "\n", - "Using a custom scoring function. See https://scikit-learn.org/stable/modules/model_evaluation.html for how to make your own scoring metric\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.741 total time= 0.0s\n", - "[CV 2/2] END .................................., score=-0.822 total time= 0.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-2.404 total time= 0.8s\n", - "[CV 2/2] END .................................., score=-1.671 total time= 0.8s\n", - "Best estimator ElasticNetCV() and best score -0.7813657065847333 and best params {}\n", - "Root Mean Squared Error: 0.7490149943228499\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.metrics import make_scorer\n", - "\n", - "def root_mean_squared_error(y_true, y_pred):\n", - " mse = mean_squared_error(y_true, y_pred)\n", - " rmse = np.sqrt(mse)\n", - " return rmse\n", - "loss_function = make_scorer(root_mean_squared_error, greater_is_better=False)\n", - "\n", - "sel_reg = SearchEstimatorList(\n", - " estimator_list=estimator_list_reg, \n", - " param_grid_list=None,\n", - " is_discrete=False,\n", - " scoring=loss_function,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - "# Predict outcomes for the test set\n", - "predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - "# Evaluate the model\n", - "rmse = root_mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Root Mean 
Squared Error: {rmse}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What this means for EconML?\n", - "\n", - "By integrating the SearchEstimatorList into econml, we can gain a number of benefits in these categories:\n", - "\n", - " Model Selection: econml contains many different models, each with its own assumptions and use cases. By using SearchEstimatorList, you can more easily compare the performance of different models on your data and select the best one.\n", - "\n", - " Hyperparameter Tuning: Many of the models in econml have hyperparameters that need to be tuned for optimal performance. SearchEstimatorList can automate this process by performing a grid search over specified hyperparameters for each model.\n", - "\n", - " Efficiency: Instead of having to manually train each model and tune its hyperparameters, SearchEstimatorList can do this all at once. This can save a significant amount of time and make the model building process more efficient.\n", - "\n", - "See the example below with data taken fromt he Customer Segmentation at an Online Media Company Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method neg_mean_squared_error.\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - "A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "*** Causal Estimate ***\n", - "\n", - "## Identified estimand\n", - "Estimand type: nonparametric-ate\n", - "\n", - "### Estimand : 1\n", - "Estimand name: backdoor\n", - "Estimand expression:\n", - " d \n", - "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", - "d[log_price] \n", - "\n", - " \n", - "_purchased,has_membership,is_US,account_age])\n", - " \n", - "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", - "\n", - "## Realized estimand\n", - "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", - "Target units: ate\n", - "\n", - "## Estimate\n", - "Mean value: 2.6518132830256684\n", - "Effect estimates: [ 2.57968831 -0.23224908 4.35502223 ... 
0.85234463 -3.53167996\n", - " 6.99294565]\n", - "\n" - ] - } - ], - "source": [ - "# Import the sample pricing data\n", - "file_url = \"https://msalicedatapublic.z5.web.core.windows.net/datasets/Pricing/pricing_sample.csv\"\n", - "train_data = pd.read_csv(file_url)\n", - "\n", - "# Data sample\n", - "train_data.head()\n", - "\n", - "# Define estimator inputs\n", - "train_data[\"log_demand\"] = np.log(train_data[\"demand\"])\n", - "train_data[\"log_price\"] = np.log(train_data[\"price\"])\n", - "\n", - "Y = train_data[\"log_demand\"].values\n", - "T = train_data[\"log_price\"].values\n", - "X = train_data[[\"income\"]].values # features\n", - "confounder_names = [\"account_age\", \"age\", \"avg_hours\", \"days_visited\", \"friends_count\", \"has_membership\", \"is_US\", \"songs_purchased\"]\n", - "W = train_data[confounder_names].values\n", - "\n", - "# Get test data\n", - "X_test = np.linspace(0, 5, 100).reshape(-1, 1)\n", - "X_test_data = pd.DataFrame(X_test, columns=[\"income\"])\n", - "\n", - "# initiate an EconML cate estimator\n", - "est = LinearDML(model_y='gbf', model_t='gbf',\n", - " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", - "\n", - "# fit through dowhy\n", - "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", - " confounder_names=confounder_names, inference=\"statsmodels\")\n", - "\n", - "lineardml_estimate = est_dw.estimate_\n", - "print(lineardml_estimate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Define underlying treatment effect function given DGP\n", - "def gamma_fn(X):\n", - " return -3 - 14 * (X[\"income\"] < 1)\n", - "\n", - "def beta_fn(X):\n", - " return 20 + 0.5 * (X[\"avg_hours\"]) + 5 * (X[\"days_visited\"] > 4)\n", - "\n", - "def demand_fn(data, T):\n", - " Y = gamma_fn(data) * T + beta_fn(data)\n", - " return Y\n", - "\n", - "def true_te(x, n, stats):\n", - " if x < 1:\n", - " subdata = train_data[train_data[\"income\"] < 1].sample(n=n, replace=True)\n", - " else:\n", - " subdata = train_data[train_data[\"income\"] >= 1].sample(n=n, replace=True)\n", - " te_array = subdata[\"price\"] * gamma_fn(subdata) / (subdata[\"demand\"])\n", - " if stats == \"mean\":\n", - " return np.mean(te_array)\n", - " elif stats == \"median\":\n", - " return np.median(te_array)\n", - " elif isinstance(stats, int):\n", - " return np.percentile(te_array, stats)\n", - "\n", - "# Get the estimate and range of true treatment effect\n", - "truth_te_estimate = np.apply_along_axis(true_te, 1, X_test, 1000, \"mean\") # estimate\n", - "truth_te_upper = np.apply_along_axis(true_te, 1, X_test, 1000, 95) # upper level\n", - "truth_te_lower = np.apply_along_axis(true_te, 1, X_test, 1000, 5) # lower level\n", - "\n", - "te_pred = est_dw.effect(X_test).flatten()\n", - "te_pred_interval = est_dw.effect_interval(X_test)\n", - "\n", - "# Compare the estimate and the truth\n", - "plt.figure(figsize=(10, 6))\n", - "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", - "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " te_pred_interval[0].flatten(),\n", - " te_pred_interval[1].flatten(),\n", - " alpha=0.2,\n", - " label=\"95% Confidence Interval\",\n", - ")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " truth_te_lower,\n", - " truth_te_upper,\n", - " alpha=0.2,\n", - " 
label=\"True Elasticity Range\",\n", - ")\n", - "plt.xlabel(\"Income\")\n", - "plt.ylabel(\"Songs Sales Elasticity\")\n", - "plt.title(\"Songs Sales Elasticity vs Income\")\n", - "plt.legend(loc=\"lower right\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method neg_mean_squared_error.\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 1.1s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... 
total time= 0.3s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.4s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator RandomForestRegressor() and best score -0.007087413279468611 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 2.3s\n", - "[CV] END .................................................... total time= 2.3s\n", - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 12.6s\n", - "[CV] END .................................................... total time= 10.5s\n", - "Best estimator RandomForestRegressor() and best score -0.015753967716546576 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... 
total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 0.2s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.3s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator RandomForestRegressor() and best score -0.006845612318994855 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... 
total time= 2.2s\n", - "[CV] END .................................................... total time= 2.1s\n", - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 12.2s\n", - "[CV] END .................................................... total time= 14.3s\n", - "Best estimator RandomForestRegressor() and best score -0.014455828883075759 and best params {}\n", - "*** Causal Estimate ***\n", - "\n", - "## Identified estimand\n", - "Estimand type: nonparametric-ate\n", - "\n", - "### Estimand : 1\n", - "Estimand name: backdoor\n", - "Estimand expression:\n", - " d \n", - "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", - "d[log_price] \n", - "\n", - " \n", - "_purchased,has_membership,is_US,account_age])\n", - " \n", - "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", - "\n", - "## Realized estimand\n", - "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", - "Target units: ate\n", - "\n", - "## Estimate\n", - "Mean value: -0.9764341213588181\n", - "Effect estimates: [-1.06939218 -1.44817143 -0.81689907 ... 
-1.30445479 -1.87209822\n", - " -0.40427838]\n", - "\n" - ] - } - ], - "source": [ - "# initiate an EconML cate estimator\n", - "\n", - "est = LinearDML(model_y=['forest', 'nnet'], model_t=['nnet', 'forest'], scaling=False,\n", - " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", - "\n", - "# fit through dowhy\n", - "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", - " confounder_names=confounder_names, inference=\"statsmodels\")\n", - "\n", - "lineardml_estimate = est_dw.estimate_\n", - "print(lineardml_estimate)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "te_pred = est_dw.effect(X_test).flatten()\n", - "te_pred_interval = est_dw.effect_interval(X_test)\n", - "\n", - "# Compare the estimate and the truth\n", - "plt.figure(figsize=(10, 6))\n", - "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", - "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " te_pred_interval[0].flatten(),\n", - " te_pred_interval[1].flatten(),\n", - " alpha=0.2,\n", - " label=\"95% Confidence Interval\",\n", - ")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " truth_te_lower,\n", - " truth_te_upper,\n", - " alpha=0.2,\n", - " label=\"True Elasticity Range\",\n", - ")\n", - "plt.xlabel(\"Income\")\n", - "plt.ylabel(\"Songs Sales Elasticity\")\n", - "plt.title(\"Songs Sales Elasticity vs Income\")\n", - "plt.legend(loc=\"lower right\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 6d41ada7b0c4df89aa5dd2be566f1b36246430f3 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Wed, 8 Nov 2023 17:19:37 -0500 Subject: [PATCH 05/19] Fix some model selection logic Signed-off-by: Keith Battocchi --- econml/sklearn_extensions/linear_model.py | 3 +- econml/sklearn_extensions/model_selection.py | 150 +++++++++++-------- 2 files changed, 91 insertions(+), 62 deletions(-) diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index 8045d23bf..7c29cbd70 100644 --- a/econml/sklearn_extensions/linear_model.py 
+++ b/econml/sklearn_extensions/linear_model.py @@ -1276,7 +1276,8 @@ class WeightedLassoCVWrapper(_PairedEstimatorWrapper): _known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) - _post_fit_attrs = set(['alpha_', 'alphas_', 'coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + _post_fit_attrs = set(['alpha_', 'alphas_', 'coef_', 'dual_gap_', + 'intercept_', 'n_iter_', 'n_features_in_', 'mse_path_']) class WeightedLassoWrapper(_PairedEstimatorWrapper): diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index b123fb5a2..94d16b091 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -378,10 +378,10 @@ def __init__(self, model): def train(self, is_selecting, *args, groups=None, **kwargs): # whether selecting or not, need to train the model on the data - # TODO: want to get out-of-sample score here if selecting, which - # would require cross-validation, but want to respect grouping, stratifying, etc. _fit_with_groups(self.model, *args, groups=groups, **kwargs) if is_selecting and hasattr(self.model, 'score'): + # TODO: we need to alter this to use out-of-sample score here, which + # will require cross-validation, but should respect grouping, stratifying, etc. 
self._score = self.model.score(*args, **kwargs) return self @@ -394,6 +394,69 @@ def best_score(self): return self._score +def _copy_to(m1, m2, attrs, insert_underscore=False): + for attr in attrs: + setattr(m2, attr, getattr(m1, attr + "_" if insert_underscore else attr)) + + +def _convert_linear_model(model, new_cls, extra_attrs=[]): + new_model = new_cls() + # copy common parameters + _copy_to(model, new_model, ["fit_intercept", "max_iter", + "tol", + "random_state"]) + # copy common fitted variables + _copy_to(model, new_model, ["coef_", "intercept_", "n_features_in_", "n_iter_"]) + # copy attributes unique to this class + _copy_to(model, new_model, extra_attrs) + return new_model + + +def _to_logisticRegression(model: LogisticRegressionCV): + lr = _convert_linear_model(model, LogisticRegression) + _copy_to(model, lr, ["penalty", "dual", "intercept_scaling", + "class_weight", + "solver", "multi_class", + "verbose", "n_jobs"]) + _copy_to(model, lr, ["classes_"]) + + _copy_to(model, lr, ["C", "l1_ratio"], True) # these are arrays in LogisticRegressionCV, need to convert them next + + # make sure all classes agree on best c/l1 combo + assert np.isclose(lr.C, lr.C.flatten()[0]).all() + assert np.equal(lr.l1_ratio, None).all() or np.isclose(lr.l1_ratio, lr.l1_ratio.flatten()[0]).all() + lr.C = lr.C[0] + lr.l1_ratio = lr.l1_ratio[0] + avg_scores = np.average([v for k, v in model.scores_.items()], axis=1) # average over folds + best_scores = np.max(avg_scores, axis=tuple(range(1, avg_scores.ndim))) # average score of best c/l1 combo + assert np.isclose(best_scores, best_scores.flatten()[0]).all() # make sure all folds agree on best c/l1 combo + return lr, best_scores[0] + + +def _convert_linear_regression(model, new_cls, extra_attrs=["positive"]): + new_model = _convert_linear_model(model, new_cls, ["normalize", "copy_X", + "n_iter_"]) + _copy_to(model, new_model, ["alpha"], True) + return new_model + + +def _to_elasticNet(model: ElasticNetCV, is_lasso=False, 
cls=None, extra_attrs=[]): + cls = cls or (Lasso if is_lasso else ElasticNet) + new_model = _convert_linear_regression(model, cls, extra_attrs + ['selection', 'warm_start', + 'dual_gap_']) + if not is_lasso: + # l1 ratio doesn't apply to Lasso, only ElasticNet + _copy_to(model, new_model, ["l1_ratio"], True) + max_score = np.max(np.mean(model.mse_path_, axis=-1)) # last dimension in mse_path is folds, so average over that + return new_model, max_score + + +def _to_ridge(model, cls=Ridge, extra_attrs=["positive"]): + ridge = _convert_linear_regression(model, cls, extra_attrs + ["_normalize", "solver"]) + best_score = model.best_score_ + return ridge, best_score + + class SklearnCVSelector(SingleModelSelector): """ Wraps one of sklearn's CV classes in the ModelSelector interface @@ -412,48 +475,32 @@ def can_wrap(model): @staticmethod def _model_mapping(): - return {LogisticRegressionCV: (LogisticRegression, - ["C", "l1_ratio"], - [], - ["classes_", "coef_", "intercept_", "n_features_in_", "n_iter_"]), - ElasticNetCV: (ElasticNet, - ["alpha", "l1_ratio"], - ["precompute"], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), - LassoCV: (Lasso, - ["alpha"], - ["precompute"], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), - RidgeCV: (Ridge, - ["alpha"], - [], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), - RidgeClassifierCV: (RidgeClassifier, - ["alpha"], - [], - ["label_binarizer", "coef_", "intercept_", "n_features_in_", "n_iter_"]), - MultiTaskElasticNetCV: (MultiTaskElasticNet, - ["alpha", "l1_ratio"], - ["precompute"], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), - MultiTaskLassoCV: (MultiTaskLasso, - ["alpha"], - [], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), - WeightedLassoCVWrapper: (WeightedLassoWrapper, - ["alpha"], - [], - ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]) + return {LogisticRegressionCV: _to_logisticRegression, 
+ ElasticNetCV: _to_elasticNet, + LassoCV: lambda model: _to_elasticNet(model, True, None, ["positive"]), + RidgeCV: _to_ridge, + RidgeClassifierCV: lambda model: _to_ridge(model, RidgeClassifier, ["positive", "class_weight", + "_label_binarizer"]), + MultiTaskElasticNetCV: lambda model: _to_elasticNet(model, False, MultiTaskElasticNet, extra_attrs=[]), + MultiTaskLassoCV: lambda model: _to_elasticNet(model, True, MultiTaskLasso, extra_attrs=[]), + WeightedLassoCVWrapper: lambda model: _to_elasticNet(model, True, WeightedLassoWrapper, + extra_attrs=[]), } def train(self, is_selecting: bool, *args, groups=None, **kwargs): if is_selecting: - _fit_with_groups(self.searcher, *args, groups=groups, **kwargs) - self._best_model = self._extract_best_model() - # TODO: ideally, want the out-of-sample score here instead; - # but this is not exposed in a consistent way - self._best_score = self.searcher.score(*args, **kwargs) + + if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, RandomizedSearchCV): + self._best_model = self.searcher.best_estimator_ + self._best_score = self.searcher.best_score_ + + for known_type in self._model_mapping().keys(): + if isinstance(self.searcher, known_type): + converter = self._model_mapping()[known_type] + self._best_model, self._best_score = converter(self.searcher) + return self + else: # don't need to use _fit_with_groups here since none of these models support it self.best_model.fit(*args, **kwargs) @@ -467,26 +514,6 @@ def best_model(self): def best_score(self): return self._best_score - def _extract_best_model(self): - if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, RandomizedSearchCV): - return self.searcher.best_estimator_ - else: - for known_type in self._model_mapping().keys(): - if isinstance(self.searcher, known_type): - model_type, opt_params, strip_params, fit_vars = self._model_mapping()[known_type] - model = model_type() - # set all shared parameters - for param in 
model.get_params().keys() & self.searcher.get_params().keys() - set(strip_params): - setattr(model, param, getattr(self.searcher, param)) - # update learned hyperparameters with best values - for param in opt_params: - setattr(model, param, getattr(self.searcher, param + "_")) - # set all fitted variables - for var in fit_vars: - setattr(model, var, getattr(self.searcher, var)) - return model - raise ValueError(f"Unsupported type: {type(self.searcher)}") - class ListSelector(SingleModelSelector): """ @@ -534,8 +561,9 @@ def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=Grid named_models = { 'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete else WeightedLassoCVWrapper(random_state=random_state, cv=cv)), - 'forest': (RandomForestClassifier(random_state=random_state) if is_discrete - else RandomForestRegressor(random_state=random_state)), + 'forest': (GridSearchCV(RandomForestClassifier(random_state=random_state) if is_discrete + else RandomForestRegressor(random_state=random_state), + param_grid={}, cv=cv)), } if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything return input From 0435b26ee7f03fd203c7df8acbb9f402936f1186 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Thu, 9 Nov 2023 10:48:09 -0500 Subject: [PATCH 06/19] Remove deprecated "normalize" param Signed-off-by: Keith Battocchi --- econml/sklearn_extensions/model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index 94d16b091..aafed51cb 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -434,7 +434,7 @@ def _to_logisticRegression(model: LogisticRegressionCV): def _convert_linear_regression(model, new_cls, extra_attrs=["positive"]): - new_model = _convert_linear_model(model, new_cls, ["normalize", "copy_X", + new_model = 
_convert_linear_model(model, new_cls, ["copy_X", "n_iter_"]) _copy_to(model, new_model, ["alpha"], True) return new_model From db7441345669dbed0c5ed8acc0e1ca610ea32147 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Thu, 9 Nov 2023 13:06:16 -0500 Subject: [PATCH 07/19] Adjust tests for lack of linear_first_stages Signed-off-by: Keith Battocchi --- econml/tests/test_dml.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 2f321c5f2..975323a05 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -1051,7 +1051,10 @@ def test_linear_sparse(self): Y = T * (x @ a) + xw @ g + err_Y # Test sparse estimator # --> test coef_, intercept_ - sparse_dml = SparseLinearDML(fit_cate_intercept=False) + # with this DGP, since T depends linearly on X, Y depends on X quadratically + # so we should use a quadratic featurizer + sparse_dml = SparseLinearDML(fit_cate_intercept=False, model_y=Pipeline([('poly', PolynomialFeatures(2)), + ('lr', LassoCV())])) sparse_dml.fit(Y, T, X=x, W=w) np.testing.assert_allclose(a, sparse_dml.coef_, atol=2e-1) with pytest.raises(AttributeError): @@ -1125,7 +1128,9 @@ def _test_sparse(n_p, d_w, n_r): y[fold * n:(fold + 1) * n] = y_f t[fold * n:(fold + 1) * n] = t_f - dml = SparseLinearDML(model_y=LinearRegression(fit_intercept=False), + # we have quadratic terms in y, so we need to pipeline with a quadratic featurizer + dml = SparseLinearDML(model_y=Pipeline([('poly', PolynomialFeatures(2)), + ('lr', LinearRegression(fit_intercept=False))]), model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) dml.fit(y, t, X=x, W=w) From 7e61c00cf1d14e136ca919a097e46de11c47f566 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Thu, 9 Nov 2023 16:24:14 -0500 Subject: [PATCH 08/19] Remove vestigal functionality Signed-off-by: Keith Battocchi --- econml/dml/dml.py | 46 +- econml/sklearn_extensions/model_selection.py | 220 ----- 
.../model_selection_utils.py | 817 ------------------ 3 files changed, 5 insertions(+), 1078 deletions(-) delete mode 100644 econml/sklearn_extensions/model_selection_utils.py diff --git a/econml/dml/dml.py b/econml/dml/dml.py index caa12e0c2..e32b6685d 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -367,14 +367,6 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn The estimator for fitting the response residuals to the treatment residuals. Must implement `fit` and `predict` methods, and must be a linear model for correctness. - param_list: list or 'auto', default 'auto' - The list of parameters to be used during cross-validation. - If 'auto', it will be chosen based on the model type. - - scaling: bool, default True - Whether to scale the features during the estimation process. - Scaling can help improve the performance of some models. - featurizer: :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). 
@@ -491,21 +483,13 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn def __init__(self, *, model_y, model_t, model_final, - param_list_y=None, - param_list_t=None, - scoring_y=None, - scoring_t=None, - scaling=False, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, - linear_first_stages=False, + linear_first_stages="deprecated", discrete_treatment=False, categories='auto', - verbose=2, # New cv=2, - grid_folds=2, # New - n_jobs=None, # New mc_iters=None, mc_agg='mean', random_state=None, @@ -513,19 +497,11 @@ def __init__(self, *, use_ray=False, ray_remote_func_options=None ): - # TODO: consider whether we need more care around stateful featurizers, - # since we clone it and fit separate copies self.fit_cate_intercept = fit_cate_intercept + if linear_first_stages != "deprecated": + warn("The linear_first_stages parameter is deprecated and will be removed in a future version of EconML", + DeprecationWarning) self.linear_first_stages = linear_first_stages - self.scaling = scaling - self.param_list_y = param_list_y - self.param_list_t = param_list_t - self.scoring_y = scoring_y - self.scoring_t = scoring_t - self.verbose = verbose - self.cv = cv - self.grid_folds = grid_folds - self.n_jobs = n_jobs self.featurizer = clone(featurizer, safe=False) self.model_y = clone(model_y, safe=False) self.model_t = clone(model_t, safe=False) @@ -741,19 +717,13 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): def __init__(self, *, model_y='auto', model_t='auto', - param_list_y=None, - param_list_t=None, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, - linear_first_stages=True, + linear_first_stages="deprecated", discrete_treatment=False, categories='auto', - scaling=True, - verbose=2, cv=2, - grid_folds=2, - n_jobs=None, mc_iters=None, mc_agg='mean', random_state=None, @@ -764,8 +734,6 @@ def __init__(self, *, super().__init__(model_y=model_y, model_t=model_t, - param_list_y=param_list_y, - 
param_list_t=param_list_t, model_final=None, featurizer=featurizer, treatment_featurizer=treatment_featurizer, @@ -773,11 +741,7 @@ def __init__(self, *, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, - scaling=scaling, - verbose=verbose, cv=cv, - n_jobs=n_jobs, - grid_folds=grid_folds, mc_iters=mc_iters, mc_agg=mc_agg, random_state=random_state, diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index aafed51cb..4b1456d51 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -29,10 +29,6 @@ from sklearn.utils.validation import _num_samples from .linear_model import WeightedLassoCVWrapper, WeightedLassoWrapper -from .model_selection_utils import (auto_hyperparameters, can_handle_multitask, get_complete_estimator_list, - has_random_state, is_data_scaled, is_likely_multi_task, - is_mlp, is_polynomial_pipeline, just_one_model_no_params, make_model_multi_task, - make_param_multi_task, param_grid_is_empty, supports_sample_weight) def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False): @@ -584,222 +580,6 @@ def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=Grid return FixedModelSelector(input) -class SearchEstimatorList(BaseEstimator): - """ - The SearchEstimatorList is a utility class for hyperparameter tuning. - It provides a convenient way to perform GridSearch cross-validation for - a list of estimators. The class automates the process of hyperparameter - tuning, model fitting, and prediction for multiple estimators. - - - Parameters - ---------- - estimator_list : list, string, or sklearn model object, default ['linear', 'forest'] - A list of names of estimators to be used for grid search. - - param_grid_list : list or 'auto', default 'auto' - A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. 
If set to 'auto', - the class automatically generates hyperparameters for the estimators. - - scaling : bool, default True - Indicates whether to scale the input data using StandardScaler. - - is_discrete : bool, default False - Specifies if the models in `estimator_list` are discrete. - - scoring : str or None, default None - The scoring metric to be used for selecting the best estimator. - - n_jobs : int or None, default None - The number of CPU cores to use for parallel processing during grid search. - - refit : bool, default True - Determines whether to refit the best estimator with the entire dataset after grid search. - - grid_folds : int, default 3 - Number of folds for the cross-validation during grid search. Must be at least 2. - - verbose : int, default 2 - Verbosity level of the class's methods and inner workings. - - pre_dispatch : str, default '2*n_jobs' - Controls the number of jobs that get dispatched during parallel execution of the grid search. - - random_state : int, RandomState instance, or None, default None - If int, `random_state` is the seed used by the random number generator; - If `RandomState` instance, `random_state` is the random number generator; - If None, the random number generator is the `RandomState` instance used by `np.random`. - Used when `shuffle` == True. - - error_score : float or 'raise', default np.nan - The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', - an error is raised. - - return_train_score : bool, default False - Determines whether to include training scores in the `cv_results_` attribute of the class. 
- - categorical_indices : str, int, list, or None default None - List of categorical indices - """ - - def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, - is_discrete=False, scoring=None, n_jobs=None, refit=True, cv=2, verbose=2, - pre_dispatch='2*n_jobs', random_state=None, - error_score=np.nan, return_train_score=False, categorical_indices=None): - self.estimator_list = estimator_list - self.complete_estimator_list = get_complete_estimator_list( - clone(estimator_list, safe=False), is_discrete=is_discrete, random_state=random_state) - - # TODO Add in more functionality by checking if it's an empty list. If it's just 1 dictionary - # then we're going to need to turn it into a list - # Just do more cases - if param_grid_list == 'auto': - self.param_grid_list = auto_hyperparameters( - estimator_list=self.complete_estimator_list, is_discrete=is_discrete) - elif (param_grid_list is None): - self.param_grid_list = len(self.complete_estimator_list) * [{}] - else: - if isinstance(param_grid_list, dict): - self.param_grid_list = [param_grid_list] - else: - self.param_grid_list = param_grid_list - self.categorical_indices = categorical_indices - self.scoring = scoring - if scoring is None: - if is_discrete: - self.scoring = 'f1_macro' - else: - self.scoring = 'neg_mean_squared_error' - warnings.warn(f"No scoring value was given. 
Using default score method {self.scoring}.") - self.scaling = scaling - self.n_jobs = n_jobs - self.refit = refit - self.cv = cv - self.verbose = verbose - self.random_state = random_state - self.pre_dispatch = pre_dispatch - self.error_score = error_score - self.return_train_score = return_train_score - self.is_discrete = is_discrete - self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly'] - - def fit(self, X, y, *, sample_weight=None, groups=None): - self._search_list = [] - - # Change estimators if multi_task - if is_likely_multi_task(y): - for index, estimator in enumerate(self.complete_estimator_list): - if not can_handle_multitask(model=estimator, is_discrete=self.is_discrete): - self.complete_estimator_list[index] = make_model_multi_task( - model=estimator, is_discrete=self.is_discrete) - if self.param_grid_list is not None: - self.param_grid_list[index] = make_param_multi_task( - estimator=estimator, param_grid=self.param_grid_list[index]) - - if self.scaling: - if not is_data_scaled(X): - self.scaler = StandardScaler() - scaled_X = self.scaler.fit_transform(X) - - if just_one_model_no_params(estimator_list=self.complete_estimator_list, param_list=self.param_grid_list): - # Just fit the model and return it, no need for grid search or for loop - estimator = self.complete_estimator_list[0] - if self.random_state is not None: - if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the random state of the linear part, - # the polynomial part doesn't have random state - if is_polynomial_pipeline(estimator): - estimator = estimator.set_params(linear__random_state=self.random_state) - else: - estimator.set_params(random_state=self.random_state) - if is_polynomial_pipeline(estimator=estimator): - # Only linear part of pipeline can handle sampleweight - estimator.fit(X, y, linear__sample_weight=sample_weight) - elif not supports_sample_weight(estimator=estimator): - estimator.fit(X, y) - else: - estimator.fit(X, y, 
sample_weight=sample_weight) - self.best_ind_ = None - self.best_estimator_ = estimator - self.best_score_ = None - self.best_params_ = {} - return self - for estimator, param_grid in zip(self.complete_estimator_list, self.param_grid_list): - if self.verbose: - if is_polynomial_pipeline(estimator): - print(f"Processing estimator: {type(estimator.named_steps['linear']).__name__}") - else: - print(f"Processing estimator: {type(estimator).__name__}") - try: - if self.random_state is not None: - if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the random state of the linear part, - # the polynomial part doesn't have random state - if is_polynomial_pipeline(estimator): - estimator = estimator.set_params(linear__random_state=self.random_state) - else: - estimator.set_params(random_state=self.random_state) - - temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring, - n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, - pre_dispatch=self.pre_dispatch, error_score=self.error_score, - return_train_score=self.return_train_score) - if self.scaling: - # Add sample weights to the linear layer, not the polynomial featurizer - if is_polynomial_pipeline(estimator=estimator): - temp_search.fit(scaled_X, y, groups=groups, linear__sample_weight=sample_weight) - # MLP does not have sample weight so we cannot fit the search - elif is_mlp(estimator=estimator): - temp_search.fit(scaled_X, y, groups=groups) - else: - temp_search.fit(scaled_X, y, groups=groups, sample_weight=sample_weight) - self._search_list.append(temp_search) - else: - if is_polynomial_pipeline(estimator=estimator): - temp_search.fit(X, y, groups=groups, linear__sample_weight=sample_weight) - elif not supports_sample_weight(estimator=estimator): - temp_search.fit(X, y, groups=groups) - else: - temp_search.fit(X, y, groups=groups, sample_weight=sample_weight) - self._search_list.append(temp_search) - except (ValueError, TypeError, FitFailedWarning) 
as e: - # This warning catches errors during the fit operation. - warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}" - warnings.warn(warning_msg, category=UserWarning) - if not hasattr(temp_search, 'cv_results_') and not param_grid_is_empty(param_grid=param_grid): - # This warning catches a problem after fit has run with no exception, - # however if there is no cv_results_ this indicates a failed fit operation. - warning_msg = (f"Warning: estimator {estimator} and param_grid {param_grid} " - "failed, has no attribute cv_results_.") - warnings.warn(warning_msg, category=FitFailedWarning) - try: - self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) - except Exception as e: - warning_msg = f"Failed for estimator {estimator} and param_grid {param_grid} with this error {e}." - raise Exception(warning_msg) from e - self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_ - self.best_score_ = self._search_list[self.best_ind_].best_score_ - self.best_params_ = self._search_list[self.best_ind_].best_params_ - print(f'Best estimator {self.best_estimator_} and best score {self.best_score_} ' - f'and best params {self.best_params_}') - return self - - def scaler_transform(self, X): - if self.scaling: - return self.scaler.transform(X) - - def best_model(self): - return self.best_estimator_ - - def predict(self, X): - if self.scaling: - return self.best_estimator_.predict(self.scaler.transform(X)) - return self.best_estimator_.predict(X) - - def predict_proba(self, X): - return self.best_estimator_.predict_proba(X) - - class GridSearchCVList(BaseEstimator): """ An extension of GridSearchCV that allows for passing a list of estimators each with their own parameter grid and returns the best among all estimators in the list and hyperparameter in their diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py deleted file mode 100644 index 
ab3f567d8..000000000 --- a/econml/sklearn_extensions/model_selection_utils.py +++ /dev/null @@ -1,817 +0,0 @@ - -import warnings -from sklearn.exceptions import NotFittedError -import numpy as np -import sklearn -import sklearn.ensemble -import sklearn.linear_model -import sklearn.neural_network -import sklearn.preprocessing -from sklearn.base import BaseEstimator, is_regressor, is_classifier -from sklearn.ensemble import (GradientBoostingClassifier, - GradientBoostingRegressor, - RandomForestClassifier, RandomForestRegressor) -from sklearn.linear_model import (ElasticNetCV, - LogisticRegression, - LogisticRegressionCV, MultiTaskElasticNetCV) -from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, - RandomizedSearchCV, - check_cv) -from sklearn.neural_network import MLPClassifier, MLPRegressor -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import (PolynomialFeatures, - StandardScaler) -from sklearn.svm import SVC, LinearSVC -import inspect -from sklearn.exceptions import NotFittedError -from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier -from sklearn.model_selection import KFold -import pandas as pd - - -def select_continuous_estimator(estimator_type, random_state): - """ - Returns a continuous estimator object for the specified estimator type. - - Parameters - ---------- - estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly'. - TODO Add Random State for parameter - Returns - ---------- - object: An instance of the selected estimator class. - - Raises: - ValueError: If the estimator type is unsupported. 
- """ - if estimator_type == 'linear': - return (ElasticNetCV(random_state=random_state)) - elif estimator_type == 'forest': - return RandomForestRegressor(random_state=random_state) - elif estimator_type == 'gbf': - return GradientBoostingRegressor(random_state=random_state) - elif estimator_type == 'nnet': - return (MLPRegressor(random_state=random_state)) - elif estimator_type == 'poly': - poly = PolynomialFeatures() - linear = ElasticNetCV(random_state=random_state) # Play around with precompute and tolerance - return (Pipeline([('poly', poly), ('linear', linear)])) - elif estimator_type == 'weighted_lasso': - from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper - return WeightedLassoCVWrapper(random_state=random_state) - else: - raise ValueError(f"Unsupported estimator type: {estimator_type}") - - -def select_discrete_estimator(estimator_type, random_state): - """ - Returns a discrete estimator object for the specified estimator type. - - Parameters - ---------- - estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly'. - TODO Add Random State for parameter - Returns - ---------- - object: An instance of the selected estimator class. - - Raises: - ValueError: If the estimator type is unsupported. 
- """ - - if estimator_type == 'linear': - return (LogisticRegressionCV(cv=KFold(random_state=random_state), - multi_class='auto', random_state=random_state)) - elif estimator_type == 'forest': - return RandomForestClassifier(random_state=random_state) - elif estimator_type == 'gbf': - return GradientBoostingClassifier(random_state=random_state) - elif estimator_type == 'nnet': - return (MLPClassifier(random_state=random_state)) - elif estimator_type == 'poly': - poly = PolynomialFeatures() - linear = (LogisticRegressionCV(cv=KFold(random_state=random_state), - multi_class='auto', random_state=random_state)) - return (Pipeline([('poly', poly), ('linear', linear)])) - else: - raise ValueError(f"Unsupported estimator type: {estimator_type}") - - -def select_estimator(estimator_type, is_discrete, random_state): - """ - Returns an estimator object for the specified estimator and target types. - - Parameters - ---------- - estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', - 'gbf', 'nnet', 'poly', 'automl', 'all'. - is_discrete (bool): The type of target variable, if true then it's discrete. - TODO Add Random State for parameter - Returns - ---------- - object: An instance of the selected estimator class. - - Raises: - ValueError: If the estimator or target types are unsupported. - """ - if not isinstance(is_discrete, bool): - raise ValueError(f"Unsupported target type: {type(is_discrete)}. is_discrete should be of type bool.") - elif is_discrete: - return select_discrete_estimator(estimator_type=estimator_type, random_state=random_state) - else: - return select_continuous_estimator(estimator_type=estimator_type, random_state=random_state) - - -def is_likely_estimator(estimator): - """ - Check if an object is likely to be an estimator. - - This function checks if an object has 'fit' and 'predict' methods, or if it is an instance of BaseEstimator. - - Parameters - ---------- - estimator : object - The object to check. 
- - Returns - ------- - bool - True if the object is likely to be an estimator, False otherwise. - """ - - required_methods = ['fit', 'predict'] - return all(hasattr(estimator, method) for method in required_methods) or isinstance(estimator, BaseEstimator) - - -def check_list_type(lst): - """ - Checks if a list only contains strings, sklearn model objects, and sklearn model selection objects. - - Parameters - ---------- - lst (list): A list to check. - - Returns - ---------- - bool: True if the list only contains valid objects, False otherwise. - - Raises: - TypeError: If the list contains objects other than strings, sklearn model objects, - or sklearn model selection objects. - - Examples: - >>> check_list_type(['linear', RandomForestRegressor(), KFold()]) - True - >>> check_list_type([1, 'linear']) - TypeError: The list must contain only strings, sklearn model objects, and sklearn model selection objects. - """ - if len(lst) == 0: - raise ValueError("Estimator list is empty. Please add some models or use some of the defaults provided.") - - for element in lst: - if (not isinstance(element, (str, BaseCrossValidator))): - if not is_likely_estimator(element): - raise TypeError( - "The list must contain only strings, sklearn model objects, and sklearn model selection objects. " - f"Invalid element: {element}") - return True - - -def get_complete_estimator_list(estimator_list, is_discrete, random_state): - ''' - Returns a list of sklearn objects from an input list of str's, and sklearn objects. - - Parameters - ---------- - estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', - 'nnet', 'poly', 'auto', 'all'. - is_discrete (bool): if target type is discrete or continuous. - - Returns - ---------- - object: A list of sklearn objects - - Raises: - ValueError: If the estimator is not supported. 
- - ''' - if isinstance(estimator_list, str): - if 'all' == estimator_list: - estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'] - elif 'auto' == estimator_list: - estimator_list = ['linear'] - elif estimator_list in ['linear', 'forest', 'gbf', 'nnet', 'poly']: - estimator_list = [estimator_list] - else: - raise ValueError( - "Invalid estimator_list value. Please provide a valid value from the list of available estimators: " - "['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") - elif isinstance(estimator_list, list): - if 'auto' in estimator_list: - for estimator in ['linear']: - if estimator not in estimator_list: - estimator_list.append(estimator) - if 'all' in estimator_list: - for estimator in ['linear', 'forest', 'gbf', 'nnet', 'poly']: - if estimator not in estimator_list: - estimator_list.append(estimator) - - elif is_likely_estimator(estimator_list): - estimator_list = [estimator_list] - else: - raise ValueError(f"Incorrect type: {type(estimator_list)}") - check_list_type(estimator_list) - temp_est_list = [] - - if not isinstance(estimator_list, list): - raise ValueError(f"estimator_list should be of type list not: {type(estimator_list)}") - - # Set to remove duplicates - for estimator in set(estimator_list): - # if sklearn object: add to list, else turn str into corresponding sklearn object and add to list - if isinstance(estimator, BaseCrossValidator) or is_likely_estimator(estimator): - temp_est_list.append(estimator) - else: - temp_est_list.append(select_estimator(estimator_type=estimator, - is_discrete=is_discrete, random_state=random_state)) - temp_est_list = flatten_list(temp_est_list) - - # Check that all types of models are matched towards the problem. 
- for estimator in temp_est_list: - if (isinstance(estimator, BaseEstimator)): - if not is_regressor_or_classifier(estimator, is_discrete=is_discrete): - raise TypeError(f"Invalid estimator type: {type(estimator)} - must be a regressor or classifier") - return temp_est_list - - -def select_classification_hyperparameters(estimator): - """ - Returns a hyperparameter grid for the specified classification model type. - - Parameters - ---------- - model_type (str): The type of model to be used. Valid values are 'linear', 'forest', 'nnet', and 'poly'. - - Returns - ---------- - A dictionary representing the hyperparameter grid to search over. - """ - - if isinstance(estimator, LogisticRegressionCV): - return { - 'Cs': [0.01, 0.1, 1], - 'cv': [3], - 'penalty': ['l1', 'l2', 'elasticnet'], - 'solver': ['lbfgs', 'liblinear', 'saga'] - } - elif isinstance(estimator, RandomForestClassifier): - return { - 'n_estimators': [100, 500], - 'max_depth': [None, 5, 10, 20], - 'min_samples_split': [2, 5], - 'min_samples_leaf': [1, 2] - } - elif isinstance(estimator, GradientBoostingClassifier): - return { - 'n_estimators': [100, 500], - 'learning_rate': [0.01, 0.05, 0.1], - 'max_depth': [3, 5, 7], - - } - elif isinstance(estimator, MLPClassifier): - return { - 'hidden_layer_sizes': [(10,), (50,), (100,)], - 'alpha': [0.0001, 0.01], - 'learning_rate': ['constant', 'adaptive'] - } - elif is_polynomial_pipeline(estimator=estimator): - return { - 'poly__degree': [2, 3, 4], - 'linear__max_iter': [100, 200], - 'linear__penalty': ['l2'], - 'linear__solver': ['saga', 'lbfgs'] - } - else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " - "LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", - category=UserWarning) - return {} - # raise ValueError("Invalid model type. 
Valid values are 'linear', 'forest', 'nnet', and 'poly'.") - - -def select_regression_hyperparameters(estimator): - """ - Returns a dictionary of hyperparameters to be searched over for a regression model. - - Parameters - ---------- - model_type (str): The type of model to be used. Valid values are 'linear', 'forest', 'nnet', and 'poly'. - - Returns - ---------- - A dictionary of hyperparameters to be searched over using a grid search. - """ - if isinstance(estimator, ElasticNetCV): - return { - 'l1_ratio': [0.1, 0.5, 0.9], - 'cv': [3], - 'max_iter': [1000], - } - elif isinstance(estimator, RandomForestRegressor): - return { - 'n_estimators': [100], - 'max_depth': [None, 10, 50], - 'min_samples_split': [2, 5, 10], - } - elif isinstance(estimator, MLPRegressor): - return { - 'hidden_layer_sizes': [(10,), (50,), (100,)], - 'alpha': [0.0001, 0.01], - 'learning_rate': ['constant', 'adaptive'] - } - elif isinstance(estimator, GradientBoostingRegressor): - return { - 'n_estimators': [100, 500], - 'learning_rate': [0.01, 0.1, 0.05], - 'max_depth': [3, 5], - } - elif is_polynomial_pipeline(estimator=estimator): - return { - 'linear__l1_ratio': [0.1, 0.5, 0.9], - 'linear__max_iter': [1000], - 'poly__degree': [2, 3, 4] - } - else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " - "ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", - category=UserWarning) - return {} - - -def flatten_list(lst): - """ - Flatten a list that may contain nested lists. - - Parameters - ---------- - lst (list): The list to flatten. - - Returns - ---------- - list: The flattened list. - """ - flattened = [] - for item in lst: - if isinstance(item, list): - flattened.extend(flatten_list(item)) - else: - flattened.append(item) - return flattened - - -def auto_hyperparameters(estimator_list, is_discrete=True): - """ - Selects hyperparameters for a list of estimators. 
- - Parameters - ---------- - - estimator_list: list of scikit-learn estimators - - is_discrete: boolean indicating whether the problem is classification or regression - - Returns - ---------- - - param_list: list of parameter grids for the estimators - """ - param_list = [] - for estimator in estimator_list: - if is_discrete: - param_list.append(select_classification_hyperparameters(estimator=estimator)) - else: - param_list.append(select_regression_hyperparameters(estimator=estimator)) - return param_list - - -def set_search_hyperparameters(search_object, hyperparameters): - if isinstance(search_object, (RandomizedSearchCV, GridSearchCV)): - search_object.set_params(**hyperparameters) - else: - raise ValueError("Invalid search object") - - -def is_mlp(estimator): - return isinstance(estimator, (MLPClassifier, MLPRegressor)) - - -def has_random_state(model): - """ - Check if a model has a 'random_state' parameter. - - This function inspects the model's signature to check if it has a 'random_state' parameter. - - Parameters - ---------- - model : object - The model to check. - - Returns - ------- - bool - True if the model has a 'random_state' parameter, False otherwise. - """ - - if is_polynomial_pipeline(model): - signature = inspect.signature(type(model['linear'])) - else: - signature = inspect.signature(type(model)) - return ("random_state" in signature.parameters) - - -def supports_sample_weight(estimator): - """ - Check if a model supports 'sample_weight'. - - This function inspects the signature of the model's 'fit' method to check if it supports 'sample_weight'. - - Parameters - ---------- - model : object - The model to check. - - Returns - ------- - bool - True if the model supports 'sample_weight', False otherwise. 
- """ - - fit_signature = inspect.signature(estimator.fit) - return 'sample_weight' in fit_signature.parameters - - -def just_one_model_no_params(estimator_list, param_list): - """ - Check if there is only one model and the parameter list is empty. - - This function checks if the length of the model and parameter list is 1 and 0 respectively. - - Parameters - ---------- - estimator_list : list - List of models. - - param_list : list - List of parameters. - - Returns - ------- - bool - True if there is only one model and the parameter list is empty, False otherwise. - """ - - return (len(estimator_list) == 1) and (len(param_list) == 1) and (len(param_list[0]) == 0) - - -def param_grid_is_empty(param_grid): - """ - Check if a parameter grid is empty. - - This function checks if the length of the parameter grid is 0. - - Parameters - ---------- - param_grid : dict - Parameter grid to check. - - Returns - ------- - bool - True if the parameter grid is empty, False otherwise. - """ - - return len(param_grid) == 0 - - -def is_linear_model(estimator): - """ - Check if a model is a linear model. - - This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of - LogisticRegression, LinearSVC, or SVC. - - Parameters - ---------- - model : object - The model to check. - - Returns - ------- - bool - True if the model is a linear model, False otherwise. - """ - - if isinstance(estimator, Pipeline): - has_poly_feature_step = any(isinstance(step[1], PolynomialFeatures) for step in estimator.steps) - if has_poly_feature_step: - return True - - if hasattr(estimator, 'fit_intercept') and hasattr(estimator, 'coef_'): - return True - - if isinstance(estimator, (LogisticRegression, LinearSVC, SVC)): - return True - - return False - - -def is_data_scaled(X): - """ - Check if input data is scaled. - - This function checks if the input data is scaled by comparing its mean and standard deviation to - 0 and 1 respectively. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data. - - Returns - ------- - bool - True if the input data is scaled, False otherwise. - - """ - mean = np.mean(X, axis=0) - std = np.std(X, axis=0) - - is_scaled = np.allclose(mean, 0.0) and np.allclose(std, 1.0) - - return is_scaled - - -def is_regressor_or_classifier(model, is_discrete): - """ - Check if a model is a regressor or classifier. - - This function checks if a model is a regressor or classifier depending on the 'is_discrete' parameter. - - Parameters - ---------- - model : object - The model to check. - - is_discrete : bool - If True, checks if the model is a classifier. If False, checks if the model is a regressor. - - Returns - ------- - bool - True if the model matches the type specified by 'is_discrete', False otherwise. - """ - - if is_discrete: - if is_polynomial_pipeline(model): - return is_classifier(model[1]) - else: - return is_classifier(model) - else: - if is_polynomial_pipeline(model): - return is_regressor(model[1]) - else: - return is_regressor(model) - - -def scale_pipeline(model): - """ - Returns a pipeline that scales the input data using StandardScaler and applies the given model. - - Parameters - ---------- - model : estimator object - A model object that implements the scikit-learn estimator interface. - - Returns - ---------- - pipe : Pipeline object - A pipeline that scales the input data using StandardScaler and applies the given model. - """ - pipe = Pipeline([('scaler', StandardScaler()), ('model', model)]) - return pipe - - -def is_polynomial_pipeline(estimator): - """ - Check if a model is a polynomial pipeline. - - This function checks if a model is a pipeline that includes a PolynomialFeatures step. - - Parameters - ---------- - model : object - The model to check. - - Returns - ------- - bool - True if the model is a polynomial pipeline, False otherwise. 
- """ - - if not isinstance(estimator, Pipeline): - return False - steps = estimator.steps - if len(steps) != 2: - return False - poly_step = steps[0] - if not isinstance(poly_step[1], PolynomialFeatures): - return False - return True - - -def is_likely_multi_task(y): - """ - Check if a target array is likely multi-task. - - This function checks if a target array is likely to be multi-task by checking its shape. - - Parameters - ---------- - y : array-like - The target array to check. - - Returns - ------- - bool - True if the target array is likely multi-task, False otherwise. - """ - - if len(y.shape) == 2: - if y.shape[1] > 1: - return True - return False - - -def can_handle_multitask(model, is_discrete=False): - """ - Check if a model can handle multi-task output. - - This function checks if a model can handle multi-task output by trying to fit and predict on random data. - - Parameters - ---------- - model : object - The model to check. - - Returns - ------- - bool - True if the model can handle multi-task output, False otherwise. - """ - - X = np.random.rand(10, 3) - if is_discrete: - y = np.random.randint(0, 2, (10, 2)) - else: - y = np.random.rand(10, 2) - - try: - model.fit(X, y) - except Exception as e: - return False - - try: - model.predict(X) - except Exception as e: - # warnings.warn(f"The model {model.__class__.__name__} is not properly fitted. Error: {e}") - return False - return True - - -def pipeline_convert_to_multitask(pipeline): - """ - Convert a pipeline to handle multi-task output if possible. - - This function iterates over the steps in the input pipeline. If a step is a - polynomial transformer, it adds the step to the new pipeline as is. If the - step is an estimator, it attempts to convert it to handle multi-task output - and adds the converted estimator to the new pipeline. - - Parameters - ---------- - pipeline : sklearn.Pipeline - The pipeline to convert. - - Returns - ------- - sklearn.Pipeline - The converted pipeline. 
- - Raises - ------ - ValueError - If an unknown error occurs when making model multi-task. - """ - - steps = list(pipeline.steps) - if isinstance(steps[-1][1], (LogisticRegressionCV)): - steps[-1] = ('linear', MultiOutputClassifier(steps[-1][1])) - if isinstance(steps[-1][1], (ElasticNetCV)): - steps[-1] = ('linear', MultiTaskElasticNetCV()) - new_pipeline = Pipeline(steps) - - return new_pipeline - - -def make_model_multi_task(model, is_discrete): - """ - Convert a model to handle multi-task output if possible. - - This function converts a model to handle multi-task output if possible. - - Parameters - ---------- - model : object - The model to convert. - - is_discrete : bool - If True, the model is treated as a classifier. If False, the model is treated as a regressor. - - Returns - ------- - object - The converted model if possible, raises an error otherwise. - """ - - try: - if is_discrete: - if is_polynomial_pipeline(model): - return pipeline_convert_to_multitask(model) - return MultiOutputClassifier(model) - else: - if isinstance(model, ElasticNetCV): - return MultiTaskElasticNetCV() - elif is_polynomial_pipeline(model): - return pipeline_convert_to_multitask(model) - else: - return MultiOutputRegressor(model) - except Exception as e: - raise ValueError("An unknown error occurred when making model multitask.") from e - - -def make_param_multi_task(estimator, param_grid): - """ - Convert the keys in a parameter grid to work with a multi-task model. - - This function converts the keys in a parameter grid to work with a multi-task model by prepending - 'estimator__' to each key. - - Parameters - ---------- - estimator : object - The estimator the parameter grid is for. - - param_grid : dict - The parameter grid to convert. - - Returns - ------- - dict - The converted parameter grid. 
- """ - - if isinstance(estimator, ElasticNetCV): - return param_grid - else: - param_grid_multi = {f'estimator__{k}': v for k, v in param_grid.items()} - return param_grid_multi - - -def preprocess_and_encode(data, cat_indices=None): - """ - Detects categorical columns, one-hot encodes them, and returns the preprocessed data. - - Parameters: - - data: pandas DataFrame or numpy array - - cat_indices: list of column indices (or names for DataFrame) to be considered categorical - - Returns: - - Preprocessed data in the format of the original input (DataFrame or numpy array) - """ - was_numpy = False - if isinstance(data, np.ndarray): - was_numpy = True - data = pd.DataFrame(data) - - # If cat_indices is None, detect categorical columns using object type as a heuristic - if cat_indices is None: - cat_columns = data.select_dtypes(['object']).columns.tolist() - else: - if all(isinstance(i, int) for i in cat_indices): # if cat_indices are integer indices - cat_columns = data.columns[cat_indices].tolist() - else: # assume cat_indices are column names - cat_columns = cat_indices - - data_encoded = pd.get_dummies(data, columns=cat_columns) - - if was_numpy: - return data_encoded.values - else: - return data_encoded From fe63f23a57bc05d096038fe4a99bcaa3de597882 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Thu, 9 Nov 2023 17:54:16 -0500 Subject: [PATCH 09/19] Fix linting Signed-off-by: Keith Battocchi --- econml/tests/test_dml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 975323a05..be20b2639 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -1129,7 +1129,7 @@ def _test_sparse(n_p, d_w, n_r): t[fold * n:(fold + 1) * n] = t_f # we have quadratic terms in y, so we need to pipeline with a quadratic featurizer - dml = SparseLinearDML(model_y=Pipeline([('poly', PolynomialFeatures(2)), + dml = SparseLinearDML(model_y=Pipeline([('poly', PolynomialFeatures(2)), ('lr', 
LinearRegression(fit_intercept=False))]), model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) From 6f6a5148477967b747e2e658f57286a744940a9c Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 09:47:06 -0500 Subject: [PATCH 10/19] Speed up tests by doing less model selection Signed-off-by: Keith Battocchi --- econml/tests/test_dmliv.py | 16 +++++++++++++++- econml/tests/test_driv.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/econml/tests/test_dmliv.py b/econml/tests/test_dmliv.py index 16f8f55a9..8246db428 100644 --- a/econml/tests/test_dmliv.py +++ b/econml/tests/test_dmliv.py @@ -8,7 +8,7 @@ import pytest from scipy import special from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression, LogisticRegressionCV from sklearn.preprocessing import PolynomialFeatures from econml.iv.dml import OrthoIV, DMLIV, NonParamDMLIV @@ -62,26 +62,40 @@ def eff_shape(n, d_x, d_y): None, PolynomialFeatures(degree=2, include_bias=False), ]: + # since we're running so many combinations, just use LassoCV/LogisticRegressionCV for the models + # instead of also selecting over random forest models est_list = [ OrthoIV( + model_y_xw=LassoCV(), + model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), + model_z_xw=LogisticRegressionCV() if binary_Z else LassoCV(), projection=False, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), OrthoIV( + model_y_xw=LassoCV(), + model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), + model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), projection=True, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), DMLIV( + model_y_xw=LassoCV(), + model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), + 
model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), model_final=LinearRegression(fit_intercept=False), featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), NonParamDMLIV( + model_y_xw=LassoCV(), + model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), + model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), model_final=RandomForestRegressor(), featurizer=featurizer, discrete_treatment=binary_T, diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 38bb8421a..17185b1f5 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -74,7 +74,13 @@ def eff_shape(n, d_x): Z = np.random.normal(size=(n,)) est_list = [ + # we're running a lot of tests, so use fixed models instead of model selection DRIV( + model_y_xw=LinearRegression(), + model_t_xw=LinearRegression(), + model_tz_xw=LinearRegression(), + model_t_xwz=LinearRegression() if projection else "auto", + model_z_xw=LinearRegression() if not projection else "auto", flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), model_final=StatsModelsLinearRegression( fit_intercept=False @@ -88,6 +94,11 @@ def eff_shape(n, d_x): use_ray=use_ray, ), LinearDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LinearRegression(), + model_tz_xw=LinearRegression(), + model_t_xwz=LinearRegression() if projection else "auto", + model_z_xw=LinearRegression() if not projection else "auto", flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ -98,6 +109,11 @@ def eff_shape(n, d_x): use_ray=use_ray, ), SparseLinearDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LinearRegression(), + model_tz_xw=LinearRegression(), + model_t_xwz=LinearRegression() if projection else "auto", + model_z_xw=LinearRegression() if not projection else "auto", flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ 
-108,6 +124,11 @@ def eff_shape(n, d_x): use_ray=use_ray, ), ForestDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LinearRegression(), + model_tz_xw=LinearRegression(), + model_t_xwz=LinearRegression() if projection else "auto", + model_z_xw=LinearRegression() if not projection else "auto", flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), projection=projection, fit_cov_directly=fit_cov_directly, @@ -125,6 +146,8 @@ def eff_shape(n, d_x): if binary_T and binary_Z and not fit_cov_directly: est_list += [ IntentToTreatDRIV( + model_y_xw=LinearRegression(), + model_t_xwz=LinearRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), @@ -133,6 +156,8 @@ def eff_shape(n, d_x): use_ray=use_ray, ), LinearIntentToTreatDRIV( + model_y_xw=LinearRegression(), + model_t_xwz=LinearRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), @@ -283,8 +308,8 @@ def test_fit_cov_directly(self): # fitting the covariance directly should be at least as good as computing the covariance from separate models # set the models so that model selection over random forests doesn't take too much time in the repeated trials - est = LinearDRIV(model_y_xw=LassoCV(), model_t_xw=LassoCV(), model_z_xw=LassoCV(), - model_tz_xw=LassoCV()) + est = LinearDRIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), model_z_xw=LinearRegression(), + model_tz_xw=LinearRegression()) n = 500 p = 10 From 2451faa9c79da8fddd42be8da0eba001be6ddcba Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 09:51:48 -0500 Subject: [PATCH 11/19] Ensure use of models that can fit arrays and vectors in DMLIV tests Signed-off-by: Keith Battocchi --- econml/tests/test_dmliv.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/econml/tests/test_dmliv.py b/econml/tests/test_dmliv.py index 8246db428..e2dc6c21e 100644 --- a/econml/tests/test_dmliv.py +++ 
b/econml/tests/test_dmliv.py @@ -8,12 +8,12 @@ import pytest from scipy import special from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression, LogisticRegressionCV +from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV from sklearn.preprocessing import PolynomialFeatures from econml.iv.dml import OrthoIV, DMLIV, NonParamDMLIV from econml.iv.dr._dr import _DummyCATE -from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression +from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, WeightedLassoCVWrapper from econml.utilities import shape from econml.tests.utilities import GroupingModel @@ -62,40 +62,40 @@ def eff_shape(n, d_x, d_y): None, PolynomialFeatures(degree=2, include_bias=False), ]: - # since we're running so many combinations, just use LassoCV/LogisticRegressionCV for the models - # instead of also selecting over random forest models + # since we're running so many combinations, just use LassoCV/LogisticRegressionCV + # for the models instead of also selecting over random forest models est_list = [ OrthoIV( - model_y_xw=LassoCV(), - model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), - model_z_xw=LogisticRegressionCV() if binary_Z else LassoCV(), + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_z_xw=LogisticRegressionCV() if binary_Z else WeightedLassoCVWrapper(), projection=False, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), OrthoIV( - model_y_xw=LassoCV(), - model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), - model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else 
WeightedLassoCVWrapper(), projection=True, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), DMLIV( - model_y_xw=LassoCV(), - model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), - model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), model_final=LinearRegression(fit_intercept=False), featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), NonParamDMLIV( - model_y_xw=LassoCV(), - model_t_xw=LogisticRegressionCV() if binary_T else LassoCV(), - model_t_xwz=LogisticRegressionCV() if binary_T else LassoCV(), + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), model_final=RandomForestRegressor(), featurizer=featurizer, discrete_treatment=binary_T, From 6d4a2033e71b8ecb0d5c041bcf89ef56e3d89d2e Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 11:03:51 -0500 Subject: [PATCH 12/19] Fix tests Signed-off-by: Keith Battocchi --- econml/_ortho_learner.py | 2 +- econml/dml/_rlearner.py | 5 +++-- econml/iv/dr/_dr.py | 12 +++++++----- econml/tests/test_driv.py | 40 +++++++++++++++++++++------------------ 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 270fd5d84..d41ec66c9 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -179,7 +179,7 @@ def _crossfit(model: ModelSelector, folds, use_ray, ray_remote_fun_option, *args class Wrapper: def __init__(self, model): self._model = model - def fit(self, is_selecting, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): diff --git 
a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index b1bc9e2ad..159763a6c 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -203,6 +203,7 @@ class _RLearner(_OrthoLearner): import numpy as np from sklearn.linear_model import LinearRegression from econml.dml._rlearner import _RLearner + from econml.sklearn_extensions.model_selection import get_selector from sklearn.base import clone class ModelFirst: def __init__(self, model): @@ -221,9 +222,9 @@ def predict(self, X): return self.model.predict(X) class RLearner(_RLearner): def _gen_model_y(self): - return ModelFirst(LinearRegression()) + return get_selector(ModelFirst(LinearRegression())) def _gen_model_t(self): - return ModelFirst(LinearRegression()) + return get_selector(ModelFirst(LinearRegression())) def _gen_rlearner_model_final(self): return ModelFinal() np.random.seed(123) diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index b4e2c8a40..8e48f905e 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -668,7 +668,7 @@ def _gen_ortho_learner_model_nuisance(self): model_t_xw = _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) if self.projection: - # this is a regression model since proj_t is probability + # this is a regression model since the instrument E[T|X,W,Z] is always continuous model_tz_xw = _make_first_stage_selector(self.model_tz_xw, is_discrete=False, random_state=self.random_state) @@ -679,12 +679,14 @@ def _gen_ortho_learner_model_nuisance(self): random_state=self.random_state) else: - model_tz_xw = _make_first_stage_selector(self.model_tz_xw, is_discrete=(self.discrete_treatment and - self.discrete_instrument and - not self.fit_cov_directly), + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, + is_discrete=(self.discrete_treatment and + self.discrete_instrument and + not self.fit_cov_directly), random_state=self.random_state) - model_z = _make_first_stage_selector(self.model_z_xw, 
is_discrete=self.discrete_instrument, + model_z = _make_first_stage_selector(self.model_z_xw, + is_discrete=self.discrete_instrument, random_state=self.random_state) return _BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 17185b1f5..2b2eba26d 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -77,10 +77,11 @@ def eff_shape(n, d_x): # we're running a lot of tests, so use fixed models instead of model selection DRIV( model_y_xw=LinearRegression(), - model_t_xw=LinearRegression(), - model_tz_xw=LinearRegression(), - model_t_xwz=LinearRegression() if projection else "auto", - model_z_xw=LinearRegression() if not projection else "auto", + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), model_final=StatsModelsLinearRegression( fit_intercept=False @@ -95,10 +96,11 @@ def eff_shape(n, d_x): ), LinearDRIV( model_y_xw=LinearRegression(), - model_t_xw=LinearRegression(), - model_tz_xw=LinearRegression(), - model_t_xwz=LinearRegression() if projection else "auto", - model_z_xw=LinearRegression() if not projection else "auto", + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), 
flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ -110,10 +112,11 @@ def eff_shape(n, d_x): ), SparseLinearDRIV( model_y_xw=LinearRegression(), - model_t_xw=LinearRegression(), - model_tz_xw=LinearRegression(), - model_t_xwz=LinearRegression() if projection else "auto", - model_z_xw=LinearRegression() if not projection else "auto", + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ -125,10 +128,11 @@ def eff_shape(n, d_x): ), ForestDRIV( model_y_xw=LinearRegression(), - model_t_xw=LinearRegression(), - model_tz_xw=LinearRegression(), - model_t_xwz=LinearRegression() if projection else "auto", - model_z_xw=LinearRegression() if not projection else "auto", + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), projection=projection, fit_cov_directly=fit_cov_directly, @@ -147,7 +151,7 @@ def eff_shape(n, d_x): est_list += [ IntentToTreatDRIV( model_y_xw=LinearRegression(), - model_t_xwz=LinearRegression(), + model_t_xwz=LogisticRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), @@ -157,7 
+161,7 @@ def eff_shape(n, d_x): ), LinearIntentToTreatDRIV( model_y_xw=LinearRegression(), - model_t_xwz=LinearRegression(), + model_t_xwz=LogisticRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), From 818ff9c8bd02459dfa41d56ecd0c9e9ccfa3ed0b Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 11:29:02 -0500 Subject: [PATCH 13/19] Speed up tests Signed-off-by: Keith Battocchi --- econml/tests/test_dml.py | 2 +- econml/tests/test_treatment_featurization.py | 71 +++++++++++++------- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index be20b2639..5867f4d40 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -159,7 +159,7 @@ def make_random(n, is_discrete, d): True, ['auto']), (LinearDML(model_y=Lasso(), - model_t='auto', + model_t=model_t, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, discrete_treatment=is_discrete, diff --git a/econml/tests/test_treatment_featurization.py b/econml/tests/test_treatment_featurization.py index 834da1ffe..a5b825253 100644 --- a/econml/tests/test_treatment_featurization.py +++ b/econml/tests/test_treatment_featurization.py @@ -4,7 +4,7 @@ import unittest import numpy as np from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression from sklearn.ensemble import RandomForestRegressor from joblib import Parallel, delayed @@ -14,7 +14,7 @@ from econml.iv.dr import DRIV, LinearDRIV, SparseLinearDRIV, ForestDRIV from econml.orf import DMLOrthoForest from sklearn.preprocessing import OneHotEncoder, FunctionTransformer -from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression +from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, WeightedLassoCVWrapper from econml.utilities import jacify_featurizer 
from econml.iv.sieve import DPolynomialFeatures @@ -200,6 +200,25 @@ def sum_squeeze_func_transform(x): class TestTreatmentFeaturization(unittest.TestCase): def test_featurization(self): + # use LassoCV rather than also selecting over RandomForests to save time + dml_models = { + "model_t": WeightedLassoCVWrapper(), + "model_y": WeightedLassoCVWrapper() + } + + dmliv_models = { + "model_y_xw": WeightedLassoCVWrapper(), + "model_t_xw": WeightedLassoCVWrapper(), + "model_t_xwz": WeightedLassoCVWrapper(), + } + + driv_models = { + "model_y_xw": WeightedLassoCVWrapper(), + "model_t_xw": WeightedLassoCVWrapper(), + "model_z_xw": WeightedLassoCVWrapper(), + "model_tz_xw": WeightedLassoCVWrapper(), + } + identity_config = { 'DGP_params': { 'n': 2000, @@ -223,10 +242,10 @@ def test_featurization(self): 'squeeze_Ts': [False, True], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': LinearDML, 'init_args': {}}, - {'class': CausalForestDML, 'init_args': {}}, - {'class': SparseLinearDML, 'init_args': {}}, - {'class': KernelDML, 'init_args': {}}, + {'class': LinearDML, 'init_args': dml_models}, + {'class': CausalForestDML, 'init_args': dml_models}, + {'class': SparseLinearDML, 'init_args': dml_models}, + {'class': KernelDML, 'init_args': dml_models}, ] } @@ -253,10 +272,10 @@ def test_featurization(self): 'squeeze_Ts': [False, True], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': LinearDML, 'init_args': {}}, - {'class': CausalForestDML, 'init_args': {}}, - {'class': SparseLinearDML, 'init_args': {}}, - {'class': KernelDML, 'init_args': {}}, + {'class': LinearDML, 'init_args': dml_models}, + {'class': CausalForestDML, 'init_args': dml_models}, + {'class': SparseLinearDML, 'init_args': dml_models}, + {'class': KernelDML, 'init_args': dml_models}, ] } @@ -268,9 +287,11 @@ def test_featurization(self): poly_IV_config['DGP_params']['d_z'] = 1 poly_IV_config['DGP_params']['nuisance_TZ'] = lambda Z: Z poly_IV_config['est_dicts'] = [ - {'class': OrthoIV, 'init_args': { - 
'model_t_xwz': RandomForestRegressor(random_state=1), 'projection': True}}, - {'class': DMLIV, 'init_args': {'model_t_xwz': RandomForestRegressor(random_state=1)}}, + {'class': OrthoIV, 'init_args': {**dmliv_models, + 'model_t_xwz': RandomForestRegressor(random_state=1), + 'projection': True}}, + {'class': DMLIV, 'init_args': {**dmliv_models, + 'model_t_xwz': RandomForestRegressor(random_state=1)}}, ] poly_1d_config = deepcopy(poly_config) @@ -287,11 +308,13 @@ def test_featurization(self): poly_1d_IV_config['treatment_featurizer'] = polynomial_1d_treatment_featurizer poly_1d_IV_config['actual_cme'] = poly_1d_actual_cme poly_1d_IV_config['est_dicts'] = [ - {'class': NonParamDMLIV, 'init_args': {'model_final': StatsModelsLinearRegression()}}, - {'class': DRIV, 'init_args': {'fit_cate_intercept': True}}, - {'class': LinearDRIV, 'init_args': {}}, - {'class': SparseLinearDRIV, 'init_args': {}}, - {'class': ForestDRIV, 'init_args': {}}, + {'class': NonParamDMLIV, 'init_args': {**dmliv_models, + 'model_final': StatsModelsLinearRegression()}}, + {'class': DRIV, 'init_args': {**driv_models, + 'fit_cate_intercept': True}}, + {'class': LinearDRIV, 'init_args': driv_models}, + {'class': SparseLinearDRIV, 'init_args': driv_models}, + {'class': ForestDRIV, 'init_args': driv_models}, ] sum_IV_config = { @@ -319,11 +342,13 @@ def test_featurization(self): 'squeeze_Ts': [False], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': NonParamDMLIV, 'init_args': {'model_final': StatsModelsLinearRegression()}}, - {'class': DRIV, 'init_args': {'fit_cate_intercept': True}}, - {'class': LinearDRIV, 'init_args': {}}, - {'class': SparseLinearDRIV, 'init_args': {}}, - {'class': ForestDRIV, 'init_args': {}}, + {'class': NonParamDMLIV, 'init_args': {**dmliv_models, + 'model_final': StatsModelsLinearRegression()}}, + {'class': DRIV, 'init_args': {**driv_models, + 'fit_cate_intercept': True}}, + {'class': LinearDRIV, 'init_args': driv_models}, + {'class': SparseLinearDRIV, 'init_args': 
driv_models}, + {'class': ForestDRIV, 'init_args': driv_models}, ] } From ba7de62c9df204ef557e7fc6283d7db993226fdc Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 17:05:29 -0500 Subject: [PATCH 14/19] Make tests more reliable Signed-off-by: Keith Battocchi --- econml/dml/_rlearner.py | 4 ++-- econml/tests/test_dml.py | 4 ++-- econml/tests/test_driv.py | 2 +- econml/tests/test_drlearner.py | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index 159763a6c..bcde54fc9 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -222,9 +222,9 @@ def predict(self, X): return self.model.predict(X) class RLearner(_RLearner): def _gen_model_y(self): - return get_selector(ModelFirst(LinearRegression())) + return get_selector(ModelFirst(LinearRegression()), is_discrete=False) def _gen_model_t(self): - return get_selector(ModelFirst(LinearRegression())) + return get_selector(ModelFirst(LinearRegression()), is_discrete=False) def _gen_rlearner_model_final(self): return ModelFinal() np.random.seed(123) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 5867f4d40..167c29632 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -1012,9 +1012,9 @@ def prediction_stderr(self, X): assert dml.marginal_effect_interval(1) == (1, 1) def test_sparse(self): + # Ensure reproducibility + np.random.seed(1234) for _ in range(5): - # Ensure reproducibility - np.random.seed(1234) n_p = np.random.randint(2, 5) # 2 to 4 products d_w = np.random.randint(0, 5) # random number of covariates min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 2b2eba26d..111ecd608 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -313,7 +313,7 @@ def test_fit_cov_directly(self): # set the models so that model selection over random forests doesn't take too 
much time in the repeated trials est = LinearDRIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), model_z_xw=LinearRegression(), - model_tz_xw=LinearRegression()) + model_tz_xw=LassoCV()) n = 500 p = 10 diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index 3d3e982a9..a02aea617 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -765,6 +765,7 @@ def test_DRLearner(self): outcome_model = Pipeline( [('poly', PolynomialFeatures()), ('model', LinearRegression())]) DR_learner = DRLearner(model_regression=outcome_model, + model_propensity=LogisticRegressionCV(), model_final=LinearRegression()) self._test_te(DR_learner, tol=0.5, te_type="heterogeneous") # Test heterogenous treatment effect for W =/= None From a551c19d728c860e9ba9ee5e299cdd0c3f8ea326 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 19:00:32 -0500 Subject: [PATCH 15/19] Try to fix tests Signed-off-by: Keith Battocchi --- econml/tests/test_dml.py | 13 ++++++------- econml/tests/test_driv.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 167c29632..656596fca 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -1014,13 +1014,12 @@ def prediction_stderr(self, X): def test_sparse(self): # Ensure reproducibility np.random.seed(1234) - for _ in range(5): - n_p = np.random.randint(2, 5) # 2 to 4 products - d_w = np.random.randint(0, 5) # random number of covariates - min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product - n_r = np.random.randint(min_n, min_n + 3) - with self.subTest(n_p=n_p, d_w=d_w, n_r=n_r): - TestDML._test_sparse(n_p, d_w, n_r) + n_p = np.random.randint(2, 5) # 2 to 4 products + d_w = np.random.randint(0, 5) # random number of covariates + min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product + n_r = np.random.randint(min_n, min_n + 3) + 
with self.subTest(n_p=n_p, d_w=d_w, n_r=n_r): + TestDML._test_sparse(n_p, d_w, n_r) def test_linear_sparse(self): """SparseDML test with a sparse DGP""" diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 111ecd608..6863006f1 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -236,7 +236,7 @@ def test_cate_api_without_ray(self): self._test_cate_api(use_ray=False) def _test_accuracy(self, use_ray=False): - np.random.seed(42) + np.random.seed(123) # dgp (binary T, binary Z) From 96cb47e33cf50c792c825b2727b350d3d12c6f9d Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 10 Nov 2023 21:51:29 -0500 Subject: [PATCH 16/19] Fix tests Signed-off-by: Keith Battocchi --- econml/dml/_rlearner.py | 18 +++++++++++------- econml/tests/test_dml.py | 2 +- econml/tests/test_driv.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index bcde54fc9..c1db38dab 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -203,16 +203,20 @@ class _RLearner(_OrthoLearner): import numpy as np from sklearn.linear_model import LinearRegression from econml.dml._rlearner import _RLearner - from econml.sklearn_extensions.model_selection import get_selector + from econml.sklearn_extensions.model_selection import SingleModelSelector from sklearn.base import clone - class ModelFirst: + class ModelSelector(SingleModelSelector): def __init__(self, model): self._model = clone(model, safe=False) - def fit(self, X, W, Y, sample_weight=None): + def train(self, is_selecting, X, W, Y, sample_weight=None): self._model.fit(np.hstack([X, W]), Y) return self - def predict(self, X, W): - return self._model.predict(np.hstack([X, W])) + @property + def best_model(self): + return self._model + @property + def best_score(self): + return 0 class ModelFinal: def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_var=None): self.model = 
LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), @@ -222,9 +226,9 @@ def predict(self, X): return self.model.predict(X) class RLearner(_RLearner): def _gen_model_y(self): - return get_selector(ModelFirst(LinearRegression()), is_discrete=False) + return ModelSelector(LinearRegression()) def _gen_model_t(self): - return get_selector(ModelFirst(LinearRegression()), is_discrete=False) + return ModelSelector(LinearRegression()) def _gen_rlearner_model_final(self): return ModelFinal() np.random.seed(123) diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 656596fca..57f6e9051 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -1013,7 +1013,7 @@ def prediction_stderr(self, X): def test_sparse(self): # Ensure reproducibility - np.random.seed(1234) + np.random.seed(123) n_p = np.random.randint(2, 5) # 2 to 4 products d_w = np.random.randint(0, 5) # random number of covariates min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 6863006f1..9ff237c47 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -236,7 +236,7 @@ def test_cate_api_without_ray(self): self._test_cate_api(use_ray=False) def _test_accuracy(self, use_ray=False): - np.random.seed(123) + np.random.seed(0) # dgp (binary T, binary Z) From f454f24b0b1ed1f876d4a00621bf85f7298172ce Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Sat, 11 Nov 2023 05:55:40 -0500 Subject: [PATCH 17/19] Fix docstrings Signed-off-by: Keith Battocchi --- econml/dml/_rlearner.py | 16 ++++++--- econml/dml/causal_forest.py | 6 ++-- econml/dml/dml.py | 52 ++++++++++++++-------------- econml/dr/_drlearner.py | 69 ++++++++++++++++++------------------- econml/iv/dml/_dml.py | 18 +++++----- econml/iv/dr/_dr.py | 52 ++++++++++++++-------------- econml/panel/dml/_dml.py | 46 ++++++++++++------------- 7 files changed, 133 insertions(+), 126 deletions(-) diff 
--git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index c1db38dab..99eb347a7 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -205,12 +205,20 @@ class _RLearner(_OrthoLearner): from econml.dml._rlearner import _RLearner from econml.sklearn_extensions.model_selection import SingleModelSelector from sklearn.base import clone - class ModelSelector(SingleModelSelector): + class ModelFirst: def __init__(self, model): self._model = clone(model, safe=False) - def train(self, is_selecting, X, W, Y, sample_weight=None): + def fit(self, X, W, Y, sample_weight=None): self._model.fit(np.hstack([X, W]), Y) return self + def predict(self, X, W): + return self._model.predict(np.hstack([X, W])) + class ModelSelector(SingleModelSelector): + def __init__(self, model): + self._model = ModelFirst(model) + def train(self, is_selecting, X, W, Y, sample_weight=None): + self._model.fit(X, W, Y, sample_weight=sample_weight) + return self @property def best_model(self): return self._model @@ -250,9 +258,9 @@ def _gen_rlearner_model_final(self): array([0.999631...]) >>> est.score_ 9.82623204...e-05 - >>> [mdl._model for mdls in est.models_y for mdl in mdls] + >>> [mdl.best_model._model for mdls in est.models_y for mdl in mdls] [LinearRegression(), LinearRegression()] - >>> [mdl._model for mdls in est.models_t for mdl in mdls] + >>> [mdl.best_model._model for mdls in est.models_t for mdl in mdls] [LinearRegression(), LinearRegression()] Attributes diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 757b498ef..8b4555974 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -548,10 +548,10 @@ class CausalForestDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.76625..., 1.52176..., 0.73679...]) + array([0.88518..., 1.25061..., 0.81112...]) >>> est.effect_interval(X[:3]) - (array([0.39668..., 1.08245... 
, 0.16566...]), - array([1.13581..., 1.96107..., 1.30791...])) + (array([0.40163..., 0.75023..., 0.46629...]), + array([1.36873..., 1.75099.., 1.15596...])) Attributes ---------- diff --git a/econml/dml/dml.py b/econml/dml/dml.py index e32b6685d..4852cb83f 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -467,18 +467,19 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.63382..., 1.78225..., 0.71859...]) + array([0.65142..., 1.82917..., 0.79287...]) >>> est.effect_interval(X[:3]) - (array([0.27937..., 1.27619..., 0.42091...]),...([0.98827... , 2.28831..., 1.01628...])) + (array([0.28936..., 1.31239..., 0.47626...]), + array([1.01348..., 2.34594..., 1.10949...])) >>> est.coef_ - array([ 0.42857..., 0.04488..., -0.03317..., 0.02258..., -0.14875...]) + array([ 0.32570..., -0.05311..., -0.03973..., 0.01598..., -0.11045...]) >>> est.coef__interval() - (array([ 0.25179..., -0.10558..., -0.16723... , -0.11916..., -0.28759...]), - array([ 0.60535..., 0.19536..., 0.10088..., 0.16434..., -0.00990...])) + (array([ 0.13791..., -0.20081..., -0.17941..., -0.12073..., -0.25769...]), + array([0.51348..., 0.09458..., 0.09993..., 0.15269..., 0.03679...])) >>> est.intercept_ - 1.01166... + 1.02940... >>> est.intercept__interval() - (0.87125..., 1.15207...) + (0.88754..., 1.17125...) """ def __init__(self, *, @@ -699,20 +700,19 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59252... , 1.74657..., 0.77384...]) + array([0.60257..., 1.74564..., 0.72062...]) >>> est.effect_interval(X[:3]) - (array([0.25503..., 1.24556..., 0.48440...]), - array([0.93002... , 2.24757..., 1.06328... 
])) + (array([0.25760..., 1.24005..., 0.41770...]), + array([0.94754..., 2.25123..., 1.02354...])) >>> est.coef_ - array([ 0.39746..., -0.00313..., 0.01346..., 0.01402..., -0.09071...]) + array([ 0.41635..., 0.00287..., -0.01831..., -0.01197..., -0.11620...]) >>> est.coef__interval() - (array([ 0.23709..., -0.13618... , -0.11712..., -0.11954..., -0.22782...]), - array([0.55783..., 0.12991..., 0.14405..., 0.14758..., 0.04640...])) + (array([ 0.24496..., -0.13418..., -0.14852..., -0.13947..., -0.25089...]), + array([0.58775..., 0.13993..., 0.11189..., 0.11551..., 0.01848...])) >>> est.intercept_ - 0.99197... + 0.97162... >>> est.intercept__interval() - (0.85855..., 1.12539...) - + (0.83640..., 1.10684...) """ def __init__(self, *, @@ -955,19 +955,19 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59401..., 1.74717..., 0.77105...]) + array([0.59812..., 1.75138..., 0.71770...]) >>> est.effect_interval(X[:3]) - (array([0.26608..., 1.26369..., 0.48690...]), - array([0.92195..., 2.23066..., 1.05520...])) + (array([0.25046..., 1.24249..., 0.42606...]), + array([0.94577..., 2.26028..., 1.00935... ])) >>> est.coef_ - array([ 0.39857..., -0.00101... , 0.01112..., 0.01457..., -0.09117...]) + array([ 0.41820..., 0.00506..., -0.01831..., -0.00778..., -0.11965...]) >>> est.coef__interval() - (array([ 0.24285..., -0.13728..., -0.12351..., -0.11585..., -0.22974...]), - array([0.55430..., 0.13526..., 0.14576..., 0.14501... , 0.04738...])) + (array([ 0.25058..., -0.13713..., -0.15469..., -0.13932..., -0.26252...]), + array([0.58583..., 0.14726..., 0.11806..., 0.12376..., 0.02320...])) >>> est.intercept_ - 0.99378... + 0.97131... >>> est.intercept__interval() - (0.86045..., 1.12711...) + (0.83363..., 1.10899...) """ def __init__(self, *, @@ -1204,7 +1204,7 @@ class KernelDML(DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59341..., 1.54740..., 0.69454... 
]) + array([0.64124..., 1.46561..., 0.68568... ]) """ def __init__(self, model_y='auto', model_t='auto', @@ -1412,7 +1412,7 @@ class NonParamDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.35318..., 1.28760..., 0.83506...]) + array([0.32389..., 0.85703..., 0.97468...]) """ def __init__(self, *, diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 1f74890e0..4a026ff4f 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -340,30 +340,29 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.const_marginal_effect(X[:2]) - array([[0.511640..., 1.144004...], - [0.378140..., 0.613143...]]) + array([[0.520977..., 1.244073...], + [0.365645..., 0.749762...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([0.511640..., 0.378140...]) + array([0.520977..., 0.365645...]) >>> est.score_ - 5.11238581... + 3.15958089... >>> est.score(y, T, X=X) - 5.78673506... + 2.60965712... >>> est.model_cate(T=1).coef_ - array([0.434910..., 0.010226..., 0.047913...]) + array([0.369069..., 0.016610..., 0.019072...]) >>> est.model_cate(T=2).coef_ - array([ 0.863723..., 0.086946..., -0.022288...]) + array([ 0.768336..., 0.082106..., -0.030475...]) >>> est.cate_feature_names() ['X0', 'X1', 'X2'] >>> [mdl.coef_ for mdls in est.models_regression for mdl in mdls] - [array([ 1.472..., 0.001..., -0.011..., 0.698..., 2.049...]), - array([ 1.455..., -0.002..., 0.005..., 0.677..., 1.998...])] + [array([ 1.463..., 0.006..., -0.006..., 0.726..., 2.029...]), + array([ 1.466..., -0.002..., 0..., 0.646..., 2.014...])] >>> [mdl.coef_ for mdls in est.models_propensity for mdl in mdls] - [array([[-0.747..., 0.153..., -0.018...], - [ 0.083..., -0.110..., -0.076...], - [ 0.663..., -0.043... 
, 0.094...]]), - array([[-1.048..., 0.000..., 0.032...], - [ 0.019..., 0.124..., -0.081...], - [ 1.029..., -0.124..., 0.049...]])] + [array([[-0.67903093, 0.04261741, -0.05969718], + [ 0.034..., -0.013..., -0.013...], + [ 0.644..., -0.028..., 0.073...]]), array([[-0.831..., 0.100..., 0.090...], + [ 0.084..., 0.013..., -0.154...], + [ 0.747..., -0.113..., 0.063...]])] Beyond default models: @@ -385,19 +384,19 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.score_ - 1.7... + 3.7... >>> est.const_marginal_effect(X[:3]) - array([[0.68..., 1.10...], - [0.56..., 0.79...], - [0.34..., 0.10...]]) + array([[0.64..., 1.23...], + [0.49..., 0.92...], + [0.20..., 0.26...]]) >>> est.model_cate(T=2).coef_ - array([0.74..., 0. , 0. ]) + array([0.72..., 0. , 0. ]) >>> est.model_cate(T=2).intercept_ - 1.9... + 2.0... >>> est.model_cate(T=1).coef_ - array([0.24..., 0.00..., 0. ]) + array([0.31..., 0.01..., 0.00...]) >>> est.model_cate(T=1).intercept_ - 0.94... + 0.97... Attributes ---------- @@ -865,17 +864,17 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([ 0.409743..., 0.312604..., -0.127394...]) + array([0.457602..., 0.335707..., 0.011288...]) >>> est.effect_interval(X[:3]) - (array([ 0.065306..., -0.182074..., -0.765901...]), array([0.754180..., 0.807284..., 0.511113...])) + (array([ 0.164623..., -0.098980..., -0.493464...]), array([0.750582..., 0.77039... , 0.516041...])) >>> est.coef_(T=1) - array([ 0.450779..., -0.003214... , 0.063884... ]) + array([ 0.338061..., 0.025654..., 0.044389...]) >>> est.coef__interval(T=1) - (array([ 0.155111..., -0.246272..., -0.136827...]), array([0.746447..., 0.239844..., 0.264595...])) + (array([ 0.135677..., -0.155845..., -0.143376...]), array([0.540446..., 0.207155..., 0.232155...])) >>> est.intercept_(T=1) - 0.88425066... + 0.78646497... 
>>> est.intercept__interval(T=1) - (0.64868548..., 1.11981585...) + (0.60344468..., 0.96948526...) Attributes ---------- @@ -1158,17 +1157,17 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([ 0.41..., 0.31..., -0.12...]) + array([0.45..., 0.33..., 0.01...]) >>> est.effect_interval(X[:3]) - (array([-0.02..., -0.29... , -0.84...]), array([0.84..., 0.92..., 0.59...])) + (array([ 0.11..., -0.13..., -0.54...]), array([0.79..., 0.80..., 0.57...])) >>> est.coef_(T=1) - array([ 0.45..., -0.00..., 0.06...]) + array([0.33..., 0.02..., 0.04...]) >>> est.coef__interval(T=1) - (array([ 0.20..., -0.23..., -0.17...]), array([0.69..., 0.23..., 0.30...])) + (array([ 0.14..., -0.15..., -0.14...]), array([0.53..., 0.20..., 0.23...])) >>> est.intercept_(T=1) - 0.88... + 0.78... >>> est.intercept__interval(T=1) - (0.64..., 1.11...) + (0.60..., 0.96...) Attributes ---------- diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index 1cddcc247..649293cde 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -331,19 +331,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.57086..., 6.06523..., -3.02513...]) + array([-4.49594..., 5.79852..., -2.88049...]) >>> est.effect_interval(X[:3]) - (array([-7.45472..., 1.85334..., -5.47322...]), - array([-1.68700... , 10.27712..., -0.57704...])) + (array([-7.40954..., 1.47475..., -5.32889...]), + array([-1.58235..., 10.12229..., -0.43209...])) >>> est.coef_ - array([ 5.11260... 
, 0.71353..., 0.38242..., -0.23891..., -0.07036...]) + array([ 5.27614..., 0.92092..., 0.57579..., -0.22810..., -0.16952...]) >>> est.coef__interval() - (array([ 3.76773..., -0.42532..., -0.78145..., -1.36996..., -1.22505...]), - array([6.45747..., 1.85239..., 1.54631..., 0.89213..., 1.08432...])) + (array([ 3.93362..., -0.22159..., -0.59863..., -1.39139..., -1.34549...]), + array([6.61866..., 2.06345..., 1.75022..., 0.93518..., 1.00644...])) >>> est.intercept_ - -0.24090... + -0.29110... >>> est.intercept__interval() - (-1.39053..., 0.90872...) + (-1.45607..., 0.87386...) """ def __init__(self, *, @@ -1492,7 +1492,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-5.52240..., 7.86930..., -3.57966...]) + array([-6.18157..., 8.70189..., -4.06004...]) """ diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index 8e48f905e..c2393cabf 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -883,7 +883,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-1.71678..., -0.27824..., -3.18333...]) + array([-4.07330..., 6.01693..., -2.71813...]) """ def __init__(self, *, @@ -1364,19 +1364,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-0.54223..., 0.77763..., -2.01011...]) + array([-4.29809..., 5.94280..., -3.00977...]) >>> est.effect_interval(X[:3]) - (array([-4.73213..., -5.57270..., -5.84891...]), - array([3.64765..., 7.12797..., 1.82868...])) + (array([-7.09165..., 1.79692..., -5.46033...]), + array([-1.50452..., 10.08868..., -0.55922...])) >>> est.coef_ - array([ 3.12341..., 1.78962..., -0.45351..., -0.41677..., 0.93306...]) + array([ 4.84900..., 0.82084..., 0.24269..., -0.04771..., -0.29325...]) >>> est.coef__interval() - (array([ 1.36498..., 0.00496..., -2.28573..., -2.02274..., -0.94000...]), - array([4.88184..., 3.57428..., 1.37869..., 1.18919..., 2.80614...])) + (array([ 3.67882..., -0.35547..., -0.97063..., 
-1.15410..., -1.50482...]), + array([6.01917..., 1.99716..., 1.45603..., 1.05867..., 0.91831...])) >>> est.intercept_ - 1.10417... + -0.16276... >>> est.intercept__interval() - (-0.65690..., 2.86525...) + (-1.32713..., 1.00160...) """ def __init__(self, *, @@ -1715,19 +1715,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-0.68659..., 1.03696..., -2.10343...]) + array([-4.26791..., 5.98882..., -3.02154...]) >>> est.effect_interval(X[:3]) - (array([-4.92102..., -4.99359..., -5.79899...]), - array([3.54783..., 7.06753..., 1.59212...])) + (array([-7.06828..., 2.00060..., -5.46554...]), + array([-1.46754..., 9.97704..., -0.57754...])) >>> est.coef_ - array([ 3.18552..., 1.83651..., -0.47721..., -0.28640... , 0.87765...]) + array([ 4.84189..., 0.81844... , 0.20681..., -0.04660..., -0.28790...]) >>> est.coef__interval() - (array([ 1.43299..., 0.06316..., -2.28671..., -2.01185..., -0.93582...]), - array([4.93805..., 3.60987..., 1.33227... , 1.43904..., 2.69114...])) + (array([ 3.68288..., -0.35434..., -0.98986..., -1.18770..., -1.48722...]), + array([6.00090..., 1.99122..., 1.40349..., 1.09449..., 0.91141...])) >>> est.intercept_ - 1.15151... + -0.12298... >>> est.intercept__interval() - (-0.60109..., 2.90411...) + (-1.28204..., 1.03607...) """ def __init__(self, *, @@ -2627,7 +2627,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.29282..., 6.08590..., -2.11608...]) + array([-3.71724..., 6.39915..., -2.14545...]) """ def __init__(self, *, @@ -2921,19 +2921,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.81123..., 5.65430..., -2.63204...]) + array([-4.05294..., 6.44603..., -2.49535...]) >>> est.effect_interval(X[:3]) - (array([-8.42669..., 0.36538... , -5.82840...]), - array([-1.19578... 
, 10.94323..., 0.56430...])) + (array([-8.42902..., 0.05595..., -6.34202...]), + array([ 0.32313..., 12.83612..., 1.35131...])) >>> est.coef_ - array([ 5.01936..., 0.71988..., 0.82603..., -0.08192... , -0.02520...]) + array([ 4.99132..., 0.35043..., 0.41963..., -0.63553..., -0.33972...]) >>> est.coef__interval() - (array([ 3.52057... , -0.72550..., -0.72653..., -1.50040... , -1.52896...]), - array([6.51816..., 2.16527..., 2.37861..., 1.33656..., 1.47854...])) + (array([ 3.11828..., -1.44768..., -1.46377..., -2.36080..., -2.18746...]), + array([6.86435..., 2.14856..., 2.30303..., 1.08973..., 1.50802...])) >>> est.intercept_ - -0.45176... + -0.25633... >>> est.intercept__interval() - (-1.93313..., 1.02959...) + (-2.07961..., 1.56695...) """ def __init__(self, *, diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index 97190639b..a9de0a4a1 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -434,33 +434,33 @@ class DynamicDML(LinearModelFinalCateEstimatorMixin, _OrthoLearner): est.fit(y, T, X=X, W=None, groups=groups, inference="auto") >>> est.const_marginal_effect(X[:2]) - array([[-0.336..., -0.048..., -0.061..., 0.042..., -0.204..., - 0.00667271], - [-0.101..., 0.433..., 0.054..., -0.217..., -0.101..., - -0.159...]]) + array([[-0.345..., -0.056..., -0.044..., 0.046..., -0.202..., + 0.023...], + [-0.120..., 0.434..., 0.052..., -0.201..., -0.115..., + -0.134...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([-0.601..., -0.091...]) + array([-0.579..., -0.085...]) >>> est.effect(X[:2], T0=np.zeros((2, n_periods*T.shape[1])), T1=np.ones((2, n_periods*T.shape[1]))) - array([-0.601..., -0.091...]) + array([-0.579, -0.085...]) >>> est.coef_ - array([[ 0.112...], - [ 0.231...], - [ 0.055...], - [-0.125...], - [ 0.049...], - [-0.079...]]) + array([[ 0.108...], + [ 0.235...], + [ 0.046...], + [-0.119...], + [ 0.042...], + [-0.075...]]) >>> est.coef__interval() - (array([[-0.063...], - [-0.009...], - [-0.114...], - [-0.413...], - [-0.117...], - 
[-0.262...]]), array([[0.289...], - [0.471...], - [0.225...], - [0.163...], - [0.216...], - [0.103...]])) + (array([[-0.042...], + [-0.001...], + [-0.120...], + [-0.393...], + [-0.120...], + [-0.256...]]), array([[0.258...], + [0.473...], + [0.212...], + [0.154...], + [0.204...], + [0.104...]])) """ def __init__(self, *, From 9b456019c468c7f4e8ed9cba079e69fe7e0a83ca Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Sat, 11 Nov 2023 10:27:17 -0500 Subject: [PATCH 18/19] Fix doctests Signed-off-by: Keith Battocchi --- econml/dml/_rlearner.py | 2 +- econml/dml/causal_forest.py | 2 +- econml/dml/dml.py | 4 +-- econml/dr/_drlearner.py | 2 +- econml/iv/dml/_dml.py | 6 ++-- econml/iv/dr/_dr.py | 6 ++-- econml/panel/dml/_dml.py | 2 +- notebooks/Scaling EconML using Ray.ipynb | 36 +++++++++++------------- 8 files changed, 28 insertions(+), 32 deletions(-) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index 99eb347a7..adec1fc3e 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -217,7 +217,7 @@ class ModelSelector(SingleModelSelector): def __init__(self, model): self._model = ModelFirst(model) def train(self, is_selecting, X, W, Y, sample_weight=None): - self._model.fit(np.hstack(X, W, Y) + self._model.fit(X, W, Y, sample_weight=sample_weight) return self @property def best_model(self): diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 8b4555974..2672b3c8a 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -551,7 +551,7 @@ class CausalForestDML(_BaseDML): array([0.88518..., 1.25061..., 0.81112...]) >>> est.effect_interval(X[:3]) (array([0.40163..., 0.75023..., 0.46629...]), - array([1.36873..., 1.75099.., 1.15596...])) + array([1.36873..., 1.75099..., 1.15596...])) Attributes ---------- diff --git a/econml/dml/dml.py b/econml/dml/dml.py index 4852cb83f..be06d9b9e 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -474,7 +474,7 @@ class takes as input the parameter `model_t`, 
which is an arbitrary scikit-learn >>> est.coef_ array([ 0.32570..., -0.05311..., -0.03973..., 0.01598..., -0.11045...]) >>> est.coef__interval() - (array([ 0.13791..., -0.20081..., -0.17941..., -0.12073..., -0.25769...]), + (array([ 0.13791..., -0.20081..., -0.17941..., -0.12073..., -0.25769...]), array([0.51348..., 0.09458..., 0.09993..., 0.15269..., 0.03679...])) >>> est.intercept_ 1.02940... @@ -1204,7 +1204,7 @@ class KernelDML(DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.64124..., 1.46561..., 0.68568... ]) + array([0.64124..., 1.46561..., 0.68568...]) """ def __init__(self, model_y='auto', model_t='auto', diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 4a026ff4f..fac6da9f7 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -355,7 +355,7 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc >>> est.cate_feature_names() ['X0', 'X1', 'X2'] >>> [mdl.coef_ for mdls in est.models_regression for mdl in mdls] - [array([ 1.463..., 0.006..., -0.006..., 0.726..., 2.029...]), + [array([ 1.463..., 0.006..., -0.006..., 0.726..., 2.029...]), array([ 1.466..., -0.002..., 0..., 0.646..., 2.014...])] >>> [mdl.coef_ for mdls in est.models_propensity for mdl in mdls] [array([[-0.67903093, 0.04261741, -0.05969718], diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index 649293cde..b42925ff9 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -1113,11 +1113,11 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.47392..., 5.74626..., -3.08471...]) + array([-6.83575..., 9.40666..., -4.27123...]) >>> est.coef_ - array([ 5.00993..., 0.86981..., 0.35110..., -0.11390... , -0.17933...]) + array([ 8.07179..., 1.51080..., 0.87328..., -0.06944..., -0.47404...]) >>> est.intercept_ - -0.27719... + -0.20555... 
""" diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index c2393cabf..a72272774 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -2154,10 +2154,10 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-1.74672..., 1.57..., -1.58916...]) + array([-1.60489..., 5.40611..., -3.46904...]) >>> est.effect_interval(X[:3]) - (array([-7.05230..., -6..., -5.11344...]), - array([3.55885..., 9.9..., 1.93512...])) + (array([-5.37171..., 0.73055..., -7.15266...]), + array([ 2.16192..., 10.08168..., 0.21457...])) """ def __init__(self, *, diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index a9de0a4a1..8fe01b6d8 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -441,7 +441,7 @@ class DynamicDML(LinearModelFinalCateEstimatorMixin, _OrthoLearner): >>> est.effect(X[:2], T0=0, T1=1) array([-0.579..., -0.085...]) >>> est.effect(X[:2], T0=np.zeros((2, n_periods*T.shape[1])), T1=np.ones((2, n_periods*T.shape[1]))) - array([-0.579, -0.085...]) + array([-0.579..., -0.085...]) >>> est.coef_ array([[ 0.108...], [ 0.235...], diff --git a/notebooks/Scaling EconML using Ray.ipynb b/notebooks/Scaling EconML using Ray.ipynb index 49c9a44fa..217bd122a 100644 --- a/notebooks/Scaling EconML using Ray.ipynb +++ b/notebooks/Scaling EconML using Ray.ipynb @@ -35,11 +35,11 @@ "execution_count": 4, "id": "01b70101-d4ad-40fc-baa6-565795ee897a", "metadata": { - "tags": [], "ExecuteTime": { "end_time": "2023-08-16T18:32:09.629351Z", "start_time": "2023-08-16T18:32:09.627091Z" - } + }, + "tags": [] }, "outputs": [], "source": [ @@ -47,9 +47,8 @@ "import os\n", "import numpy as np\n", "import scipy\n", - "from econml.dml import DML\n", + "from econml.dml import LinearDML\n", "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] @@ -125,9 
+124,10 @@ "outputs": [], "source": [ "np.random.seed(123)\n", - "X = np.random.normal(size=(10000, 5))\n", + "n = 5000\n", + "X = np.random.normal(size=(n, 5))\n", "T = np.random.binomial(1, scipy.special.expit(X[:, 0]))\n", - "y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(10000,))" + "y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(n,))" ] }, { @@ -171,11 +171,9 @@ "\n", "ray_opts = {'num_cpus':2,'scheduling_strategy':'SPREAD'}\n", "\n", - "est = DML(\n", + "est = LinearDML(\n", " model_y=RandomForestRegressor(random_state=0),\n", " model_t=RandomForestClassifier(random_state=0),\n", - " model_final=StatsModelsLinearRegression(fit_intercept=False),\n", - " linear_first_stages=False,\n", " discrete_treatment=True,\n", " use_ray=True, #setting use_ray flag to True to use ray.\n", " ray_remote_func_options=ray_opts,\n", @@ -217,15 +215,13 @@ " runtimes = []\n", " for cv in cv_values:\n", " ray_opts = {'num_cpus': 2, 'scheduling_strategy': 'SPREAD'} if use_ray else None\n", - " est = DML(model_y=RandomForestRegressor(random_state=0),\n", - " model_t=RandomForestClassifier(random_state=0),\n", - " model_final=StatsModelsLinearRegression(fit_intercept=False),\n", - " linear_first_stages=False,\n", - " discrete_treatment=True,\n", - " use_ray=use_ray,\n", - " ray_remote_func_options=ray_opts,\n", - " cv=cv,\n", - " mc_iters=1)\n", + " est = LinearDML(model_y=RandomForestRegressor(random_state=0),\n", + " model_t=RandomForestClassifier(random_state=0),\n", + " discrete_treatment=True,\n", + " use_ray=use_ray,\n", + " ray_remote_func_options=ray_opts,\n", + " cv=cv,\n", + " mc_iters=1)\n", " \n", " start_time = time.time()\n", " est.fit(y, T, X=X, W=None)\n", @@ -296,9 +292,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file From 4618ffa8fbcffc9c7d5ef914ebcd581ff0ca2705 Mon Sep 17 00:00:00 2001 
From: Keith Battocchi Date: Sat, 11 Nov 2023 12:33:16 -0500 Subject: [PATCH 19/19] Fix doctests Signed-off-by: Keith Battocchi --- econml/dml/_rlearner.py | 4 ++-- econml/dr/_drlearner.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index adec1fc3e..a020b432d 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -258,9 +258,9 @@ def _gen_rlearner_model_final(self): array([0.999631...]) >>> est.score_ 9.82623204...e-05 - >>> [mdl.best_model._model for mdls in est.models_y for mdl in mdls] + >>> [mdl._model for mdls in est.models_y for mdl in mdls] [LinearRegression(), LinearRegression()] - >>> [mdl.best_model._model for mdls in est.models_t for mdl in mdls] + >>> [mdl._model for mdls in est.models_t for mdl in mdls] [LinearRegression(), LinearRegression()] Attributes diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index fac6da9f7..749eec5d9 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -868,7 +868,7 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): >>> est.effect_interval(X[:3]) (array([ 0.164623..., -0.098980..., -0.493464...]), array([0.750582..., 0.77039... , 0.516041...])) >>> est.coef_(T=1) - array([ 0.338061..., 0.025654..., 0.044389...]) + array([0.338061..., 0.025654..., 0.044389...]) >>> est.coef__interval(T=1) (array([ 0.135677..., -0.155845..., -0.143376...]), array([0.540446..., 0.207155..., 0.232155...])) >>> est.intercept_(T=1)