diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 15d7b7af3..270fd5d84 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -45,6 +45,7 @@ class in this module implements the general logic in a very versatile way from .utilities import (_deprecate_positional, check_input_arrays, cross_product, filter_none_kwargs, inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose) +from .sklearn_extensions.model_selection import ModelSelector try: import ray @@ -100,7 +101,7 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()} kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()} - model.fit(*args_train, **kwargs_train) + model.train(False, *args_train, **kwargs_train) nuisance_temp = model.predict(*args_test, **kwargs_test) if not isinstance(nuisance_temp, tuple): @@ -115,17 +116,18 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): return nuisance_temp, model, test_idxs, (score_temp if calculate_scores else None) -def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): +def _crossfit(model: ModelSelector, folds, use_ray, ray_remote_fun_option, *args, **kwargs): """ General crossfit based calculation of nuisance parameters. Parameters ---------- - model : object - An object that supports fit and predict. Fit must accept all the args - and the keyword arguments kwargs. Similarly predict must all accept - all the args as arguments and kwards as keyword arguments. The fit - function estimates a model of the nuisance function, based on the input + model : ModelSelector + An object that has train and predict methods. + The train method must take an 'is_selecting' argument first, and then + accept positional arguments `args` and keyword arguments `kwargs`; the predict method + just takes those `args` and `kwargs`. 
The train + method selects or estimates a model of the nuisance function, based on the input data to fit. Predict evaluates the fitted nuisance function on the input data to predict. folds : list of tuple or None @@ -177,7 +179,7 @@ def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): @@ -202,13 +204,17 @@ def predict(self, X, y, W=None): """ model_list = [] + + kwargs = filter_none_kwargs(**kwargs) + model.train(True, *args, **kwargs) + calculate_scores = hasattr(model, 'score') # remove None arguments - kwargs = filter_none_kwargs(**kwargs) if folds is None: # skip crossfitting model_list.append(clone(model, safe=False)) - model_list[0].fit(*args, **kwargs) + model_list[0].train(True, *args, **kwargs) + model_list[0].train(False, *args, **kwargs) # fit the selected model nuisances = model_list[0].predict(*args, **kwargs) scores = model_list[0].score(*args, **kwargs) if calculate_scores else None @@ -394,7 +400,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -448,7 +454,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) self._model_y.fit(W, Y) return self @@ -532,15 +538,15 @@ def _gen_allowed_missing_vars(self): @abstractmethod def _gen_ortho_learner_model_nuisance(self): - """ Must return a fresh instance of a nuisance model + """Must return a fresh instance of a nuisance model selector Returns ------- - model_nuisance: estimator - The 
estimator for fitting the nuisance function. Must implement - `fit` and `predict` methods that both have signatures:: + model_nuisance: selector + The selector for fitting the nuisance function. The returned selector must implement + `train` and `predict` methods with the following signatures:: - model_nuisance.fit(Y, T, X=X, W=W, Z=Z, + model_nuisance.train(is_selecting, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) model_nuisance.predict(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index bd645fda3..b1bc9e2ad 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -29,40 +29,35 @@ import numpy as np import copy from warnings import warn + +from ..sklearn_extensions.model_selection import ModelSelector from ..utilities import (shape, reshape, ndim, hstack, filter_none_kwargs, _deprecate_positional) from sklearn.linear_model import LinearRegression from sklearn.base import clone from .._ortho_learner import _OrthoLearner -class _ModelNuisance: +class _ModelNuisance(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns the residuals as two nuisance parameters. """ - def __init__(self, model_y, model_t): + def __init__(self, model_y: ModelSelector, model_t: ModelSelector): self._model_y = model_y self._model_t = model_t - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): assert Z is None, "Cannot accept instrument!" 
- self._model_t.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_y.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_y.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - if hasattr(self._model_y, 'score'): - # note that groups are not passed to score because they are only used for fitting - Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) - else: - Y_score = None - if hasattr(self._model_t, 'score'): - # note that groups are not passed to score because they are only used for fitting - T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) - else: - T_score = None + # note that groups are not passed to score because they are only used for fitting + T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) + Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) return Y_score, T_score def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -302,7 +297,7 @@ def _gen_model_y(self): """ Returns ------- - model_y: estimator of E[Y | X, W] + model_y: selector for the estimator of E[Y | X, W] The estimator for fitting the response to the features and controls. Must implement `fit` and `predict` methods. Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. :: @@ -317,7 +312,7 @@ def _gen_model_t(self): """ Returns ------- - model_t: estimator of E[T | X, W] + model_t: selector for the estimator of E[T | X, W] The estimator for fitting the treatment to the features and controls. Must implement `fit` and `predict` methods. 
Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. :: @@ -432,11 +427,11 @@ def rlearner_model_final_(self): @property def models_y(self): - return [[mdl._model_y for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t(self): - return [[mdl._model_t for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y(self): diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 4f038eb3f..757b498ef 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split from itertools import product from .dml import _BaseDML -from .dml import _FirstStageWrapper +from .dml import _make_first_stage_selector from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..inference import NormalInferenceResults @@ -668,22 +668,10 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, False, self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, 
self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return MultiOutputGRF(CausalForest(n_estimators=self.n_estimators, diff --git a/econml/dml/dml.py b/econml/dml/dml.py index d7c59013b..caa12e0c2 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -29,76 +29,85 @@ from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..utilities import (_deprecate_positional, add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, filter_none_kwargs) from .._shap import _shap_explain_model_cate -from ..sklearn_extensions.model_selection import SearchEstimatorList -import pdb +from ..sklearn_extensions.model_selection import get_selector, ModelSelector, SingleModelSelector -class _FirstStageWrapper: - def __init__(self, model, is_Y, featurizer, linear_first_stages, discrete_treatment): - self._model = clone(model, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._is_Y = is_Y - self._linear_first_stages = linear_first_stages - self._discrete_treatment = discrete_treatment - - def _combine(self, X, W, n_samples, fitting=True): - if X is None: - # if both X and W are None, just return a column of ones - return (W if W is not None else np.ones((n_samples, 1))) - XW = hstack([X, W]) if W is not None else X - if self._is_Y and self._linear_first_stages: - if self._featurizer is None: - F = X - else: - F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X) - return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F])) - else: - return XW +def _combine(X, W, n_samples): + if X is None: + # if both X and W are None, just return a column of ones + return (W if W is not None else 
np.ones((n_samples, 1))) + return hstack([X, W]) if W is not None else X - def fit(self, X, W, Target, sample_weight=None, groups=None): - if (not self._is_Y) and self._discrete_treatment: - # In this case, the Target is the one-hot-encoding of the treatment variable - # We need to go back to the label representation of the one-hot so as to call - # the classifier. - if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - Target = inverse_onehot(Target) - if sample_weight is not None: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups, - sample_weight=sample_weight) - else: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups) - return self +class _FirstStageWrapper: + def __init__(self, model, discrete_target): + self._model = model # plain sklearn-compatible model, not a ModelSelector + self._discrete_target = discrete_target def predict(self, X, W): n_samples = X.shape[0] if X is not None else (W.shape[0] if W is not None else 1) - if (not self._is_Y) and self._discrete_treatment: - return self._model.predict_proba(self._combine(X, W, n_samples, fitting=False))[:, 1:] + if self._discrete_target: + return self._model.predict_proba(_combine(X, W, n_samples))[:, 1:] else: - return self._model.predict(self._combine(X, W, n_samples, fitting=False)) + return self._model.predict(_combine(X, W, n_samples)) def score(self, X, W, Target, sample_weight=None): if hasattr(self._model, 'score'): - if (not self._is_Y) and self._discrete_treatment: + if self._discrete_target: # In this case, the Target is the one-hot-encoding of the treatment variable # We need to go back to the label representation of the one-hot so as to call # the classifier. 
Target = inverse_onehot(Target) if sample_weight is not None: - return self._model.score(self._combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) + return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) else: - return self._model.score(self._combine(X, W, Target.shape[0]), Target) + return self._model.score(_combine(X, W, Target.shape[0]), Target) else: return None +class _FirstStageSelector(SingleModelSelector): + def __init__(self, model: SingleModelSelector, discrete_target): + self._model = clone(model, safe=False) + self._discrete_target = discrete_target + + def train(self, is_selecting, X, W, Target, sample_weight=None, groups=None): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + self._model.train(is_selecting, _combine(X, W, Target.shape[0]), Target, + **filter_none_kwargs(groups=groups, sample_weight=sample_weight)) + return self + + @property + def best_model(self): + return _FirstStageWrapper(self._model.best_model, self._discrete_target) + + @property + def best_score(self): + return self._model.best_score + + +def _make_first_stage_selector(model, is_discrete, random_state): + if model == 'auto': + model = ['forest', 'linear'] + return _FirstStageSelector(get_selector(model, + is_discrete=is_discrete, + random_state=random_state), + discrete_target=is_discrete) + + class _FinalWrapper: def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick): self._model = clone(model_final, safe=False) @@ -359,7 +368,7 @@ class takes as input the parameter `model_t`, which is an arbitrary 
scikit-learn `fit` and `predict` methods, and must be a linear model for correctness. param_list: list or 'auto', default 'auto' - The list of parameters to be used during cross-validation. + The list of parameters to be used during cross-validation. If 'auto', it will be chosen based on the model type. scaling: bool, default True @@ -538,45 +547,11 @@ def _gen_allowed_missing_vars(self): def _gen_featurizer(self): return clone(self.featurizer, safe=False) - def _gen_model_y(self): # New - if self.model_y == 'auto': - model_y = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_y, scoring=self.scoring_y, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state) - else: - model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scoring=self.scoring_y, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # if self.model_y == 'auto': - # model_y = WeightedLassoCVWrapper(random_state=self.random_state) - # else: - # model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) - - def _gen_model_t(self): # New - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scoring=self.scoring_t, - scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state) - else: - model_t = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_t, scoring=self.scoring_t, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, 
is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state) + def _gen_model_y(self): + return _make_first_stage_selector(self.model_y, False, self.random_state) - else: - model_t = clone(SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, - scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment, - n_jobs=self.n_jobs, random_state=self.random_state), safe=False) - # if self.model_t == 'auto': - # if self.discrete_treatment: - # model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - # random_state=self.random_state) - # else: - # model_t = WeightedLassoCVWrapper(random_state=self.random_state) - # else: - # model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + def _gen_model_t(self): + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) @@ -1520,12 +1495,11 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - return _FirstStageWrapper(clone(self.model_y, safe=False), True, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - return _FirstStageWrapper(clone(self.model_t, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 9b75ca75d..1f74890e0 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -43,6 +43,7 @@ LogisticRegressionCV) from 
sklearn.ensemble import RandomForestRegressor + from .._ortho_learner import _OrthoLearner from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, BaseCateEstimator, ForestModelFinalCateEstimatorDiscreteMixin, @@ -51,13 +52,17 @@ from ..grf import RegressionForest from ..sklearn_extensions.linear_model import ( DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) +from ..sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, get_selector from ..utilities import (_deprecate_positional, check_high_dimensional, - filter_none_kwargs, fit_with_groups, inverse_onehot, get_feature_names_or_default) + filter_none_kwargs, inverse_onehot, get_feature_names_or_default) from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate -class _ModelNuisance: - def __init__(self, model_propensity, model_regression, min_propensity): +class _ModelNuisance(ModelSelector): + def __init__(self, + model_propensity: SingleModelSelector, + model_regression: SingleModelSelector, + min_propensity): self._model_propensity = model_propensity self._model_regression = model_regression self._min_propensity = min_propensity @@ -65,7 +70,7 @@ def __init__(self, model_propensity, model_regression, min_propensity): def _combine(self, X, W): return np.hstack([arr for arr in [X, W] if arr is not None]) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, *, sample_weight=None, groups=None): if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " "instead got {1}.".format(len(X), Y.shape)) @@ -77,22 +82,16 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - fit_with_groups(self._model_propensity, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) - 
fit_with_groups(self._model_regression, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) + self._model_propensity.train(is_selecting, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) + self._model_regression.train(is_selecting, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) return self def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - if hasattr(self._model_propensity, 'score'): - propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) - else: - propensity_score = None - if hasattr(self._model_regression, 'score'): - regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) - else: - regression_score = None + propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) + regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) return propensity_score, regression_score @@ -114,6 +113,12 @@ def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,)) +def _make_first_stage_selector(model, is_discrete, random_state): + if model == "auto": + model = ['linear', 'forest'] + return get_selector(model, is_discrete=is_discrete, random_state=random_state) + + class _ModelFinal: # Coding Remark: The reasoning around the multitask_model_final could have been simplified if # we simply wrapped the model_final with a MultiOutputRegressor. 
However, because we also want @@ -499,16 +504,8 @@ def _get_inference_options(self): return options def _gen_ortho_learner_model_nuisance(self): - if self.model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=self.random_state) - else: - model_propensity = clone(self.model_propensity, safe=False) - - if self.model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) - else: - model_regression = clone(self.model_regression, safe=False) + model_propensity = _make_first_stage_selector(self.model_propensity, True, self.random_state) + model_regression = _make_first_stage_selector(self.model_regression, False, self.random_state) return _ModelNuisance(model_propensity, model_regression, self.min_propensity) @@ -648,7 +645,7 @@ def models_propensity(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_propensity for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_propensity.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_regression(self): @@ -662,7 +659,7 @@ def models_regression(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_regression for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_regression.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_propensity(self): diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index c8889599f..685ca0db7 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -24,17 +24,30 @@ from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin, LinearCateEstimator from ...inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference from ...sklearn_extensions.linear_model import StatsModels2SLS, StatsModelsLinearRegression, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import (ModelSelector, SingleModelSelector, + WeightedStratifiedKFold, get_selector) from ...utilities import (_deprecate_positional, get_feature_names_or_default, filter_none_kwargs, add_intercept, cross_product, broadcast_unit_treatments, reshape_treatmentwise_effects, shape, parse_final_model_params, deprecated, Summary) -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...dml._rlearner import _ModelFinal from ..._shap import _shap_explain_joint_linear_model_cate, _shap_explain_model_cate -class _OrthoIVModelNuisance: - def __init__(self, model_y_xw, model_t_xw, model_z, projection): +def _combine(W, Z, n_samples): + if Z is not None: + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _OrthoIVNuisanceSelector(ModelSelector): + + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xw: SingleModelSelector, + model_z: SingleModelSelector, + projection): self._model_y_xw = model_y_xw self._model_t_xw = model_t_xw self._projection = projection @@ -43,21 +56,15 @@ def 
__init__(self, model_y_xw, model_t_xw, model_z, projection): else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -71,7 +78,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) if hasattr(self._model_t_xwz, 'score'): T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: @@ -91,7 +98,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = 
self._model_t_xwz.predict(X, WZ) else: Z_pred = self._model_z_xw.predict(X=X, W=W) @@ -387,57 +394,29 @@ def _gen_ortho_learner_model_final(self): return _OrthoIVModelFinal(self._gen_model_final(), self._gen_featurizer(), self.fit_cate_intercept) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) + model_y = _make_first_stage_selector(self.model_y_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_t = _make_first_stage_selector(self.model_t_xw, + is_discrete=self.discrete_treatment, + random_state=self.random_state) if self.projection: # train E[T|X,W,Z] - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_t_xwz, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - self.projection) + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - # train [Z|X,W] - if self.model_z_xw == "auto": - if self.discrete_instrument: - 
model_z_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) + # train E[Z|X,W] + # note: discrete_instrument rather than discrete_treatment in call to _make_first_stage_selector + model_z = _make_first_stage_selector(self.model_z_xw, + is_discrete=self.discrete_instrument, + random_state=self.random_state) - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_z_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_instrument), - self.projection) + return _OrthoIVNuisanceSelector(model_y, model_t, model_z, + self.projection) def fit(self, Y, T, *, Z, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, cache_values=False, inference="auto"): @@ -717,29 +696,24 @@ def residuals_(self): return Y_res, T_res, Z_res, self._cached_values.X, self._cached_values.W, self._cached_values.Z -class _BaseDMLIVModelNuisance: +class _BaseDMLIVNuisanceSelector(ModelSelector): """ Nuisance model fits the three models at fit time and at predict time returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
""" - def __init__(self, model_y_xw, model_t_xw, model_t_xwz): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xw = clone(model_t_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W + def __init__(self, model_y_xw: ModelSelector, model_t_xw: ModelSelector, model_t_xwz: ModelSelector): + self._model_y_xw = model_y_xw + self._model_t_xw = model_t_xw + self._model_t_xwz = model_t_xwz - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_t_xw.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t_xw.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X, WZ, T, + **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -754,7 +728,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight)) else: T_XZ_score = None @@ -764,7 +738,7 @@ def predict(self, 
Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) # note that sample_weight and groups are not passed to predict because they are only used for fitting Y_pred = self._model_y_xw.predict(X, W) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) TXZ_pred = self._model_t_xwz.predict(X, WZ) TX_pred = self._model_t_xw.predict(X, W) if (X is None) and (W is None): # In this case predict above returns a single row @@ -1183,42 +1157,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xwz, 
self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), @@ -1579,42 +1530,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return 
_make_first_stage_selector(self.model_t_xwz, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index c06df6278..648fbe63c 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -27,16 +27,23 @@ LinearCateEstimator) from ...inference import StatsModelsInference from ...sklearn_extensions.linear_model import StatsModelsLinearRegression, DebiasedLasso, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, WeightedStratifiedKFold from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs, inverse_onehot, get_feature_names_or_default, check_high_dimensional, check_input_arrays) from ...grf import RegressionForest -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...iv.dml import NonParamDMLIV from ..._shap import _shap_explain_model_cate -class _BaseDRIVModelNuisance: +def _combine(W, Z, n_samples): + if Z is not None: # Z will not be None + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _BaseDRIVNuisanceSelector(ModelSelector): def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, model_z, projection, fit_cov_directly, discrete_treatment, discrete_instrument): @@ -53,22 +60,30 @@ def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, 
mo else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary T = T.ravel() if not self._discrete_treatment else T Z = Z.ravel() if not self._discrete_instrument else Z - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting and self._fit_cov_directly: + # need to fit, too, since we call predict later inside this train method + self._model_t_xw.train(False, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + + if self._projection: + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_t_xwz.train(False, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + else: + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_z_xw.train(False, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if self._projection: - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, 
sample_weight=sample_weight, groups=groups) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) if self._fit_cov_directly: # We're projecting, so we're treating E[T|X,Z] as the instrument (ignoring W for simplicity) @@ -82,15 +97,14 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: T_pred = T_pred.reshape(T.shape) target = (T_proj - T_pred)**2 - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: # return shape (n,) target = (T * T_proj).reshape(T.shape[0],) - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) if self._fit_cov_directly: Z_pred = self._model_z_xw.predict(X, W) T_pred = self._model_t_xw.predict(X, W) @@ -111,10 +125,10 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): target_shape = Z_res.shape if Z_res.ndim > 1 else T_res.shape target = T_res.reshape(target_shape) * Z_res.reshape(target_shape) # TODO: if the T and Z models overfit, then this will be biased towards 0; - # consider using nested cross-fitting here + # consider using nested cross-fitting # a similar comment applies to the projection case - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: if self._discrete_treatment: if self._discrete_instrument: @@ -130,8 +144,8 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: # shape(n,) target = T * Z - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, 
groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) # TODO: prel_model_effect could allow sample_var and freq_weight? if self._discrete_instrument: @@ -168,7 +182,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): if self._projection: if hasattr(self._model_t_xwz, 'score'): - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) t_xwz_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: t_xwz_score = None @@ -232,7 +246,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) Z_res = T_proj - T_pred if self._fit_cov_directly: @@ -650,86 +664,38 @@ def _gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, False, self.random_state) + model_t_xw = _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) if self.projection: # this is a regression model since proj_t is probability - if self.model_tz_xw == "auto": - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = clone(self.model_tz_xw, safe=False) + model_tz_xw = 
_make_first_stage_selector(self.model_tz_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - # outcome is continuous since proj_t is probability - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, False), - model_z=_FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + # we're using E[T|X,W,Z] as the instrument + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - if self.model_tz_xw == "auto": - if self.discrete_treatment and self.discrete_instrument and not self.fit_cov_directly: - model_tz_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = clone(self.model_tz_xw, safe=False) + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, is_discrete=(self.discrete_treatment and + self.discrete_instrument and + not self.fit_cov_directly), + random_state=self.random_state) - if self.model_z_xw == 'auto': - if self.discrete_instrument: - model_z_xw = 
LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, (self.discrete_treatment and - self.discrete_instrument and - not self.fit_cov_directly)), - model_z=_FirstStageWrapper(model_z_xw, False, self._gen_featurizer(), - False, (self.discrete_instrument and - not self.fit_cov_directly)), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + model_z = _make_first_stage_selector(self.model_z_xw, is_discrete=self.discrete_instrument, + random_state=self.random_state) + + return _BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), + model_y_xw=model_y_xw, + model_t_xw=model_t_xw, + model_tz_xw=model_tz_xw, + model_z=model_z, + projection=self.projection, + fit_cov_directly=self.fit_cov_directly, + discrete_treatment=self.discrete_treatment, + discrete_instrument=self.discrete_instrument) class DRIV(_DRIV): @@ -2342,25 +2308,23 @@ def model_final(self, model): raise ValueError("Parameter `model_final` cannot be altered for this estimator!") -class _IntentToTreatDRIVModelNuisance: - def __init__(self, model_y_xw, model_t_xwz, dummy_z, prel_model_effect): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - self._dummy_z = clone(dummy_z, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def 
_combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W +class _IntentToTreatDRIVNuisanceSelector(ModelSelector): + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xwz: SingleModelSelector, + dummy_z: SingleModelSelector, + prel_model_effect): + self._model_y_xw = model_y_xw + self._model_t_xwz = model_t_xwz + self._dummy_z = dummy_z + self._prel_model_effect = prel_model_effect - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - self._dummy_z.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + self._dummy_z.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) # we need to undo the one-hot encoding for calling effect, # since it expects raw values self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, @@ -2374,7 +2338,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: T_XZ_score = None @@ -2390,8 +2354,8 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): def 
predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_pred = self._model_y_xw.predict(X, W) - T_pred_zero = self._model_t_xwz.predict(X, self._combine(W, np.zeros(Z.shape), Y.shape[0])) - T_pred_one = self._model_t_xwz.predict(X, self._combine(W, np.ones(Z.shape), Y.shape[0])) + T_pred_zero = self._model_t_xwz.predict(X, _combine(W, np.zeros(Z.shape), Y.shape[0])) + T_pred_one = self._model_t_xwz.predict(X, _combine(W, np.ones(Z.shape), Y.shape[0])) Z_pred = self._dummy_z.predict(X, W) prel_theta = self._prel_model_effect.effect(X) @@ -2486,16 +2450,8 @@ def _gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xwz == 'auto': - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, is_discrete=False, random_state=self.random_state) + model_t_xwz = _make_first_stage_selector(self.model_t_xwz, is_discrete=True, random_state=self.random_state) if self.z_propensity == "auto": dummy_z = DummyClassifier(strategy="prior") @@ -2504,14 +2460,9 @@ def _gen_ortho_learner_model_nuisance(self): else: raise ValueError("Only 'auto' or float is allowed!") - return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False), - _FirstStageWrapper(model_t_xwz, False, - self._gen_featurizer(), False, True), - _FirstStageWrapper(dummy_z, False, - self._gen_featurizer(), False, True), - self._gen_prel_model_effect() - ) + dummy_z = _make_first_stage_selector(dummy_z, is_discrete=True, random_state=self.random_state) + + return _IntentToTreatDRIVNuisanceSelector(model_y_xw, 
model_t_xwz, dummy_z, self._gen_prel_model_effect()) class _DummyCATE: diff --git a/econml/new_tests/test_model_selection.py b/econml/new_tests/test_model_selection.py deleted file mode 100644 index 1eb82db0b..000000000 --- a/econml/new_tests/test_model_selection.py +++ /dev/null @@ -1,267 +0,0 @@ -import unittest - -import numpy as np -from econml.sklearn_extensions.model_selection import * -from econml.sklearn_extensions.model_selection_utils import * -from sklearn.datasets import fetch_california_housing, load_iris -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, f1_score -from sklearn.pipeline import make_pipeline -from sklearn.svm import SVR - - -class TestSearchEstimatorListClassifier(unittest.TestCase): - def setUp(self): - self.expected_accuracy = 0.9 - self.expected_f1_score = 0.9 - self.accuracy_tolerance = 0.05 - self.f1_score_tolerance = 0.05 - self.is_discrete = True - X, y = load_iris(return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42) - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - - def test_initialization(self): - with self.assertRaises(ValueError): - SearchEstimatorList(estimator_list='invalid_estimator') - - def test_auto_param_grid_discrete(self): - - search_estimator_list = SearchEstimatorList(is_discrete=self.is_discrete, scaling=False) - search_estimator_list.fit(self.X_train, self.y_train) - self.assertIsNotNone(search_estimator_list.best_estimator_) - self.assertIsNotNone(search_estimator_list.best_score_) - self.assertIsNotNone(search_estimator_list.best_params_) - - def test_linear_estimator(self): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = 
f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_poly_estimator(self): - search = SearchEstimatorList(estimator_list='poly', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertTrue(is_polynomial_pipeline(search.complete_estimator_list[0])) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_forest_estimator(self): - search = SearchEstimatorList(estimator_list='forest', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], RandomForestClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_gbf_estimator(self): - search = SearchEstimatorList(estimator_list='gbf', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - 
self.assertIsInstance(search.complete_estimator_list[0], GradientBoostingClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_nnet_estimator(self): - search = SearchEstimatorList(estimator_list='nnet', is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], MLPClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_linear_and_forest_estimators(self): - search = SearchEstimatorList(estimator_list=['linear', 'forest'], is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 2) - self.assertEqual(len(search.param_grid_list), 2) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - self.assertIsInstance(search.complete_estimator_list[1], RandomForestClassifier) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_all_estimators(self): - search = SearchEstimatorList(estimator_list=['linear', 'forest', - 'gbf', 'nnet', 'poly'], is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 5) - self.assertEqual(len(search.param_grid_list), 5) - - 
self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_logistic_regression_estimator(self): - search = SearchEstimatorList(estimator_list=LogisticRegression(), is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_logistic_regression_cv_estimator(self): - search = SearchEstimatorList(estimator_list=LogisticRegressionCV(), - is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_empty_estimator_list(self): - with self.assertRaises(ValueError): - search = SearchEstimatorList(estimator_list=[], is_discrete=self.is_discrete, scaling=False) - - def test_invalid_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [SVR(kernel='linear')] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_polynomial_pipeline_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [make_pipeline(PolynomialFeatures(), ElasticNetCV())] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_mlp_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [MLPRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_random_forest_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [RandomForestRegressor()] - search = 
SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_gradient_boosting_regressor(self): - with self.assertRaises(TypeError): - estimator_list = [GradientBoostingRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_combined_estimators(self): - with self.assertRaises(TypeError): - estimator_list = [LogisticRegression(), SVC(), GradientBoostingRegressor()] - search = SearchEstimatorList(estimator_list=estimator_list, is_discrete=self.is_discrete) - - def test_random_forest_discrete(self): - estimator_list = [RandomForestClassifier()] - param_grid_list = [{'n_estimators': [10, 50, 100], 'max_depth': [3, 5, None]}] - - search = SearchEstimatorList( - estimator_list=estimator_list, param_grid_list=param_grid_list, is_discrete=self.is_discrete, scaling=False) - search.fit(self.X_train, self.y_train) - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - - self.assertIsNotNone(search.best_estimator_) - self.assertIsNotNone(search.best_score_) - self.assertIsNotNone(search.best_params_) - - def test_data_scaling(self): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=True) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - def test_custom_scoring_function(self): - def custom_scorer(y_true, y_pred): - return f1_score(y_true, y_pred, average='macro') - - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - 
scaling=False, scoring=custom_scorer) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - # def test_refit_false(self): - # search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, scaling=False, refit=False) - # search.fit(self.X_train, self.y_train) - # with self.assertRaises(NotFittedError): - # y_pred = search.predict(self.X_test) - - def test_custom_random_state(self): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, random_state=42) - search.fit(self.X_train, self.y_train) - y_pred = search.predict(self.X_test) - acc = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average='macro') - - self.assertEqual(len(search.complete_estimator_list), 1) - self.assertEqual(len(search.param_grid_list), 1) - self.assertIsInstance(search.complete_estimator_list[0], LogisticRegressionCV) - - self.assertGreaterEqual(acc, self.expected_accuracy) - self.assertGreaterEqual(f1, self.expected_f1_score) - - - def test_invalid_incorrect_scoring_numbers(self): - with self.assertRaises(ValueError): - search = SearchEstimatorList(estimator_list='linear', is_discrete=self.is_discrete, - scaling=False, scoring=123) - - -if __name__ == '__main__': - unittest.main() diff --git a/econml/new_tests/test_model_selection_utils.py b/econml/new_tests/test_model_selection_utils.py deleted file mode 100644 index 8e7e7c917..000000000 --- a/econml/new_tests/test_model_selection_utils.py +++ /dev/null @@ -1,235 +0,0 @@ -import unittest - -import numpy as np -from 
econml.sklearn_extensions.model_selection import * -from econml.sklearn_extensions.model_selection_utils import * -from sklearn.datasets import fetch_california_housing, load_iris -from sklearn.preprocessing import StandardScaler, PolynomialFeatures -from sklearn.model_selection import train_test_split -from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV - - -class TestIsDataScaled(unittest.TestCase): - - def test_scaled_data(self): - # Test with data that is already centered and scaled - X = np.array([[0.0, -1.0], [1.0, 0.0], [-1.0, 1.0]]) - scale = StandardScaler() - scaled_X = scale.fit_transform(X) - self.assertTrue(is_data_scaled(scaled_X)) - - def test_unscaled_data(self): - # Test with data that is not centered and scaled - X = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]]) - self.assertFalse(is_data_scaled(X)) - - def test_large_scaled_data(self): - # Test with a larger dataset that is already centered and scaled - np.random.seed(42) - X = np.random.randn(1000, 5) - scale = StandardScaler() - scaled_X = scale.fit_transform(X) - self.assertTrue(is_data_scaled(scaled_X)) - - def test_large_unscaled_data(self): - np.random.seed(42) - X = np.random.randn(1000, 5) - self.assertFalse(is_data_scaled(X)) - - def test_is_data_scaled_with_scaled_iris_dataset(self): - X, y = load_iris(return_X_y=True) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - assert is_data_scaled(X_scaled) == True - - def test_is_data_scaled_with_unscaled_iris_dataset(self): - X, y = load_iris(return_X_y=True) - assert is_data_scaled(X) == False - - def test_is_data_scaled_with_scaled_california_housing_dataset(self): - X, y = housing = fetch_california_housing(return_X_y=True) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - assert is_data_scaled(X_scaled) == True - - def test_is_data_scaled_with_unscaled_california_housing_dataset(self): - X, y = fetch_california_housing(return_X_y=True) - assert is_data_scaled(X) == False - - -class 
TestFlattenList(unittest.TestCase): - - def test_flatten_empty_list(self): - input = [] - expected_output = [] - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_simple_list(self): - input = [1, 10, 15] - expected_output = [1, 10, 15] - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_nested_list(self): - input = [1, [10, 15], [20, [25, 30]]] - expected_output = [1, 10, 15, 20, 25, 30] - self.assertEqual(flatten_list(input), expected_output) - - # Check functionality for below - # def test_flatten_none_list(self): - # input = [[1, 10, None], 15, None] - # expected_output = [1, 10, None, 15, None] - # self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_iris_dataset(self): - X = load_iris() - input = X.data.tolist() - expected_output = sum(X.data.tolist(), []) - self.assertEqual(flatten_list(input), expected_output) - - def test_flatten_california_housing_dataset(self): - X = fetch_california_housing() - input = X.data.tolist() - expected_output = sum(X.data.tolist(), []) - self.assertEqual(flatten_list(input), expected_output) - - -class TestIsPolynomialPipeline(unittest.TestCase): - - def test_is_polynomial_pipeline_true(self): - X = np.array([[5, 10], [15, 20], [25, 30], [35, 40], [45, 50]]) - y = np.array([15, 29, 38, 47, 55]) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - model = Pipeline([ - ('poly', PolynomialFeatures(degree=2)), - ('linear', ElasticNetCV()) - ]) - model.fit(X_scaled, y) - assert is_polynomial_pipeline(model) == True - - def test_is_polynomial_pipeline_false(self): - model = ElasticNetCV() - assert is_polynomial_pipeline(model) == False - - def test_is_polynomial_pipeline_false_step_number(self): - X, y = load_iris(return_X_y=True) - model = Pipeline([ - ('poly', PolynomialFeatures(degree=2)), - ('linear', LogisticRegressionCV()), - ('step_false', '') - ]) - assert is_polynomial_pipeline(model) == False - - def 
test_is_polynomial_pipeline_interchange_steps(self): - X, y = load_iris(return_X_y=True) - model = Pipeline([ - ('poly', LogisticRegressionCV()), - ('linear', PolynomialFeatures(degree=2)), - ]) - assert is_polynomial_pipeline(model) == False - - # Cross-check functionaity - can the 'poly' keyword be changed to something else - def test_is_polynomial_pipeline_false_first_step(self): - X, y = fetch_california_housing(return_X_y=True) - model = Pipeline([ - ('not_poly', PolynomialFeatures(degree=2)), - ('linear', ElasticNetCV()) - ]) - assert is_polynomial_pipeline(model) == True - - -class TestCheckListType(unittest.TestCase): - - def test_check_list_type_true(self): - list = ['linear', LogisticRegressionCV(), KFold()] - assert check_list_type(list) == True - - def test_check_list_type_false_string(self): - list = [18, LogisticRegressionCV(), KFold()] - try: - check_list_type(list) - except TypeError as e: - assert str(e) == "The list must contain only strings, sklearn model objects, and sklearn model selection objects." - - def test_check_list_type_empty(self): - list = [] - try: - check_list_type(list) - except ValueError as e: - assert str(e) == "Estimator list is empty. Please add some models or use some of the defaults provided." 
- - def test_check_list_type_all_strings(self): - list = ['linear', 'lasso', 'forest'] - assert check_list_type(list) == True - - def test_check_list_type_all_models(self): - list = [LogisticRegressionCV(), ElasticNetCV()] - assert check_list_type(list) == True - - def test_check_list_duplicate_models_strings(self): - list = [LogisticRegressionCV(), LogisticRegressionCV(), 'linear', 'linear'] - assert check_list_type(list) == True - - -class TestSelectContinuousEstimator(unittest.TestCase): - - def test_select_continuous_estimator_valid(self): - assert isinstance(select_continuous_estimator('linear'), ElasticNetCV) - assert isinstance(select_continuous_estimator('forest'), RandomForestRegressor) - assert isinstance(select_continuous_estimator('gbf'), GradientBoostingRegressor) - assert isinstance(select_continuous_estimator('nnet'), MLPRegressor) - assert isinstance(select_continuous_estimator('poly'), Pipeline) - - def test_select_continuous_estimator_invalid(self): - try: - select_continuous_estimator('ridge') - except ValueError as e: - assert str(e) == 'Unsupported estimator type: ridge' - - -class TestSelectDiscreteEstimator(unittest.TestCase): - - def test_select_discrete_estimator_valid(self): - assert isinstance(select_discrete_estimator('linear'), LogisticRegressionCV) - assert isinstance(select_discrete_estimator('forest'), RandomForestClassifier) - assert isinstance(select_discrete_estimator('gbf'), GradientBoostingClassifier) - assert isinstance(select_discrete_estimator('nnet'), MLPClassifier) - assert isinstance(select_discrete_estimator('poly'), Pipeline) - - def test_select_discrete_estimator_invalid(self): - try: - select_discrete_estimator('lasso') - except ValueError as e: - assert str(e) == 'Unsupported estimator type: lasso' - - -class TestSelectEstimator(unittest.TestCase): - - def test_select_estimator_valid(self): - assert isinstance(select_estimator('linear', is_discrete=False), ElasticNetCV) - assert isinstance(select_estimator('forest', 
is_discrete=False), RandomForestRegressor) - assert isinstance(select_estimator('gbf', is_discrete=False), GradientBoostingRegressor) - assert isinstance(select_estimator('nnet', is_discrete=False), MLPRegressor) - assert isinstance(select_estimator('poly', is_discrete=False), Pipeline) - - assert isinstance(select_estimator('linear', is_discrete=True), LogisticRegression) - assert isinstance(select_estimator('forest', is_discrete=True), RandomForestClassifier) - assert isinstance(select_estimator('gbf', is_discrete=True), GradientBoostingClassifier) - assert isinstance(select_estimator('nnet', is_discrete=True), MLPClassifier) - assert isinstance(select_estimator('poly', is_discrete=True), Pipeline) - - def test_select_estimator_invalid_estimator(self): - try: - select_estimator('lasso', is_discrete=True) - except ValueError as e: - assert str(e) == 'Unsupported estimator type: lasso' - - def test_select_estimator_invalid(self): - try: - select_estimator('linear', is_discrete=None) - except ValueError as e: - assert str(e) == 'Unsupported target type: None' - - -if __name__ == '__main__': - unittest.main() diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index c3dc96a4e..10ce615c5 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -9,13 +9,13 @@ from scipy.stats import norm from sklearn.linear_model import (ElasticNetCV, LassoCV, LogisticRegressionCV) from ...sklearn_extensions.linear_model import (StatsModelsLinearRegression, WeightedLassoCVWrapper) -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...sklearn_extensions.model_selection import ModelSelector, WeightedStratifiedKFold +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ..._cate_estimator import TreatmentExpansionMixin, LinearModelFinalCateEstimatorMixin from ..._ortho_learner import _OrthoLearner from ...utilities import (_deprecate_positional, 
add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, check_input_arrays, @@ -33,7 +33,7 @@ def _get_groups_period_filter(groups, n_periods): return group_period_filter -class _DynamicModelNuisance: +class _DynamicModelNuisanceSelector(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns @@ -45,21 +45,27 @@ def __init__(self, model_y, model_t, n_periods): self._model_t = model_t self.n_periods = n_periods - def fit(self, Y, T, X=None, W=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, sample_weight=None, groups=None): """Fit a series of nuisance models for each period or period pairs.""" assert Y.shape[0] % self.n_periods == 0, \ "Length of training data should be an integer multiple of time periods." 
period_filters = _get_groups_period_filter(groups, self.n_periods) - self._model_y_trained = {} - self._model_t_trained = {j: {} for j in np.arange(self.n_periods)} + if is_selecting: # create the per-period y and t models + self._model_y_trained = {t: clone(self._model_y, safe=False) + for t in np.arange(self.n_periods)} + self._model_t_trained = {j: {t: clone(self._model_t, safe=False) + for t in np.arange(j + 1)} + for j in np.arange(self.n_periods)} for t in np.arange(self.n_periods): - self._model_y_trained[t] = clone(self._model_y, safe=False).fit( + self._model_y_trained[t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None( W, period_filters[t]), Y[period_filters[self.n_periods - 1]]) for j in np.arange(t, self.n_periods): - self._model_t_trained[j][t] = clone(self._model_t, safe=False).fit( + self._model_t_trained[j][t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None(W, period_filters[t]), T[period_filters[j]]) @@ -534,30 +540,18 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, 
+ is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return StatsModelsLinearRegression(fit_intercept=False) def _gen_ortho_learner_model_nuisance(self): - return _DynamicModelNuisance( + return _DynamicModelNuisanceSelector( model_t=self._gen_model_t(), model_y=self._gen_model_y(), n_periods=self._n_periods) diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index 0c90c6868..8045d23bf 100644 --- a/econml/sklearn_extensions/linear_model.py +++ b/econml/sklearn_extensions/linear_model.py @@ -20,8 +20,7 @@ import warnings from collections.abc import Iterable from scipy.stats import norm -from econml.sklearn_extensions.model_selection import WeightedKFold, WeightedStratifiedKFold -from econml.utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays +from ..utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays from sklearn import clone from sklearn.linear_model import LinearRegression, LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso from sklearn.linear_model._base import _preprocess_data @@ -41,7 +40,24 @@ from typing import List +class _WeightedCVIterableWrapper(_CVIterableWrapper): + def __init__(self, cv): + super().__init__(cv) + + def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().get_n_splits(X, y, groups) + + def split(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().split(X, y, groups) + + def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): + # local import to avoid circular imports + from .model_selection import WeightedKFold, WeightedStratifiedKFold cv = 5 if cv is None else cv if isinstance(cv, 
numbers.Integral): if (classifier and (y is not None) and @@ -60,21 +76,6 @@ def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): return cv # New style cv objects are passed without any modification -class _WeightedCVIterableWrapper(_CVIterableWrapper): - def __init__(self, cv): - super().__init__(cv) - - def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().get_n_splits(X, y, groups) - - def split(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().split(X, y, groups) - - class WeightedModelMixin: """Mixin class for weighted models. @@ -1204,73 +1205,90 @@ def _set_attribute(self, attribute_name, condition=True, default=None): setattr(self, attribute_name, attribute_value) -class WeightedLassoCVWrapper: - """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" +class _PairedEstimatorWrapper: + """Helper class to wrap two different estimators, one of which can be used only with single targets and the other + which can be used on multiple targets. Not intended to be used directly by users.""" + + _SingleEst = None + _MultiEst = None + _known_params = [] + _post_fit_attrs = [] def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs - # set model to WeightedLassoCV by default so there's always a model to get and set attributes on - self.model = WeightedLassoCV(*args, **kwargs) - - # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV - # (e.g. 
former has 'positive' and 'precompute' while latter does not) - known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', - 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + # set model to the single-target estimator by default so there's always a model to get and set attributes on + self.model = self._SingleEst(*args, **kwargs) def fit(self, X, y, sample_weight=None): - self.needs_unravel = False + self._needs_unravel = False params = {key: value for (key, value) in self.get_params().items() - if key in self.known_params} + if key in self._known_params} if ndim(y) == 2 and shape(y)[1] > 1: - self.model = WeightedMultiTaskLassoCV(**params) + self.model = self._MultiEst(**params) else: if ndim(y) == 2 and shape(y)[1] == 1: y = np.ravel(y) - self.needs_unravel = True - self.model = WeightedLassoCV(**params) + self._needs_unravel = True + self.model = self._SingleEst(**params) self.model.fit(X, y, sample_weight) - # set intercept_ attribute - self.intercept_ = self.model.intercept_ - # set coef_ attribute - self.coef_ = self.model.coef_ - # set alpha_ attribute - self.alpha_ = self.model.alpha_ - # set alphas_ attribute - self.alphas_ = self.model.alphas_ - # set n_iter_ attribute - self.n_iter_ = self.model.n_iter_ + for param in self._post_fit_attrs: + setattr(self, param, getattr(self.model, param)) return self def predict(self, X): predictions = self.model.predict(X) - return reshape(predictions, (-1, 1)) if self.needs_unravel else predictions + return reshape(predictions, (-1, 1)) if self._needs_unravel else predictions def score(self, X, y, sample_weight=None): return self.model.score(X, y, sample_weight) def __getattr__(self, key): - if key in self.known_params: + if key in self._known_params: return getattr(self.model, key) else: raise AttributeError("No attribute " + key) def __setattr__(self, key, value): - if key in self.known_params: + if key in self._known_params: setattr(self.model, key, value) 
else: super().__setattr__(key, value) def get_params(self, deep=True): """Get parameters for this estimator.""" - return self.model.get_params(deep=deep) + return {k: v for k, v in self.model.get_params(deep=deep).items() if k in self._known_params} def set_params(self, **params): """Set parameters for this estimator.""" self.model.set_params(**params) +class WeightedLassoCVWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" + + _SingleEst = WeightedLassoCV + _MultiEst = WeightedMultiTaskLassoCV + + # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV + # (e.g. former has 'positive' and 'precompute' while latter does not) + _known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', + 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + + _post_fit_attrs = set(['alpha_', 'alphas_', 'coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + + +class WeightedLassoWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLasso or WeightedMultiTaskLasso depending on the shape of the target.""" + + _SingleEst = WeightedLasso + _MultiEst = WeightedMultiTaskLasso + _known_params = set(['alpha', 'fit_intercept', 'copy_X', 'max_iter', 'tol', + 'random_state', 'selection']) + _post_fit_attrs = set(['coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + + class SelectiveRegularization: """ Estimator of a linear model where regularization is applied to only a subset of the coefficients. 
diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index d8c55538d..0572a9717 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -3,27 +3,36 @@ """Collection of scikit-learn extensions for model selection techniques.""" import numbers -import pdb import warnings +import abc import numpy as np +from collections.abc import Iterable import scipy.sparse as sp import sklearn from joblib import Parallel, delayed from sklearn.base import BaseEstimator, clone, is_classifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.exceptions import FitFailedWarning -from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, KFold, +from sklearn.linear_model import (ElasticNet, ElasticNetCV, Lasso, LassoCV, MultiTaskElasticNet, MultiTaskElasticNetCV, + MultiTaskLasso, MultiTaskLassoCV, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, + LogisticRegression, LogisticRegressionCV) +from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, GroupKFold, KFold, RandomizedSearchCV, StratifiedKFold, check_cv) # TODO: conisder working around relying on sklearn implementation details from sklearn.model_selection._validation import (_check_is_permutation, _fit_and_predict) -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.utils import check_random_state, indexable from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples -from econml.sklearn_extensions.model_selection_utils import * +from .linear_model import WeightedLassoCVWrapper, WeightedLassoWrapper +from .model_selection_utils import (auto_hyperparameters, can_handle_multitask, get_complete_estimator_list, + has_random_state, is_data_scaled, is_likely_multi_task, + is_mlp, is_polynomial_pipeline, just_one_model_no_params, make_model_multi_task, 
+ make_param_multi_task, param_grid_is_empty, supports_sample_weight) def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False): @@ -261,11 +270,295 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits +class ModelSelector(metaclass=abc.ABCMeta): + """ + This class enables a two-stage fitting process, where first a model is selected + by calling `train` with `is_selecting=True`, and then the selected model is fit (presumably + on a different data set) by calling train with `is_selecting=False`. + + + """ + + @abc.abstractmethod + def train(self, is_selecting: bool, *args, **kwargs): + """ + Either selects a model or fits a model, depending on the value of `is_selecting`. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def predict(self, *args, **kwargs): + """ + Predicts using the selected model; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def score(self, *args, **kwargs): + """ + Gets the score of the selected model on the given data; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + +class SingleModelSelector(ModelSelector): + """ + A model selection class that selects a single best model; + this encompasses random search, grid search, ensembling, etc. 
+ """ + + @property + @abc.abstractmethod + def best_model(self): + raise NotImplementedError("Abstract method") + + @property + @abc.abstractmethod + def best_score(self): + raise NotImplementedError("Abstract method") + + def predict(self, *args, **kwargs): + return self.best_model.predict(*args, **kwargs) + + def predict_proba(self, *args, **kwargs): + return self.best_model.predict_proba(*args, **kwargs) + + def score(self, *args, **kwargs): + if hasattr(self.best_model, 'score'): + return self.best_model.score(*args, **kwargs) + else: + return None + + +def _fit_with_groups(model, X, y, *, groups, **kwargs): + """ + Fits a model while correctly handling grouping if necessary. + + This enables us to perform an inner-loop cross-validation of a model + which handles grouping correctly, which is not easy using typical sklearn models. + + For example, GridSearchCV and RandomSearchCV both support passing `groups` to fit, + but other CV-related estimators (e.g. LassoCV) do not, which means that GroupKFold + cannot be used as the cv instance, because the `groups` argument will never be passed through + to GroupKFold's `split` method. + + The hacky workaround here is to explicitly set the `cv` attribute to the set of + rows that GroupKFold would have generated rather than using GroupKFold as the cv instance. 
+ """ + if groups is not None: + if hasattr(model, 'cv'): + old_cv = model.cv + # logic copied from check_cv + cv = 5 if old_cv is None else old_cv + if isinstance(cv, numbers.Integral): + cv = GroupKFold(cv) + # otherwise we will assume the user already set the cv attribute to something + # compatible with splitting with a `groups` argument + + splits = list(cv.split(X, y, groups=groups)) + try: + model.cv = splits + return model.fit(X, y, **kwargs) # drop groups from arg list + finally: + model.cv = old_cv + + # drop groups from arg list, which were already used at the outer level and may not be supported by the model + return model.fit(X, y, **kwargs) + + +class FixedModelSelector(SingleModelSelector): + """ + Model selection class that always selects the given model + """ + + def __init__(self, model): + self.model = clone(model, safe=False) + + def train(self, is_selecting, *args, groups=None, **kwargs): + # whether selecting or not, need to train the model on the data + _fit_with_groups(self.model, *args, groups=groups, **kwargs) + if is_selecting and hasattr(self.model, 'score'): + self._score = self.model.score(*args, **kwargs) + return self + + @property + def best_model(self): + return self.model + + @property + def best_score(self): + return self._score + + +class SklearnCVSelector(SingleModelSelector): + """ + Wraps one of sklearn's CV classes in the ModelSelector interface + """ + + def __init__(self, searcher): + self.searcher = clone(searcher) + + @staticmethod + def convertible_types(): + return {GridSearchCV, RandomizedSearchCV} | SklearnCVSelector._model_mapping().keys() + + @staticmethod + def can_wrap(model): + return any(isinstance(model, model_type) for model_type in SklearnCVSelector.convertible_types()) + + @staticmethod + def _model_mapping(): + return {LogisticRegressionCV: (LogisticRegression, + ["C", "l1_ratio"], + [], + ["classes_", "coef_", "intercept_", "n_features_in_", "n_iter_"]), + ElasticNetCV: (ElasticNet, + ["alpha", 
"l1_ratio"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + LassoCV: (Lasso, + ["alpha"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + RidgeCV: (Ridge, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + RidgeClassifierCV: (RidgeClassifier, + ["alpha"], + [], + ["label_binarizer", "coef_", "intercept_", "n_features_in_", "n_iter_"]), + MultiTaskElasticNetCV: (MultiTaskElasticNet, + ["alpha", "l1_ratio"], + ["precompute"], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + MultiTaskLassoCV: (MultiTaskLasso, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]), + WeightedLassoCVWrapper: (WeightedLassoWrapper, + ["alpha"], + [], + ["coef_", "intercept_", "dual_gap_", "n_features_in_", "n_iter_"]) + } + + def train(self, is_selecting: bool, *args, groups=None, **kwargs): + if is_selecting: + + _fit_with_groups(self.searcher, *args, groups=groups, **kwargs) + self._best_model = self._extract_best_model() + # TODO: ideally, want the out-of-sample score here instead; + # but this is not exposed in a consistent way + self._best_score = self.searcher.score(*args, **kwargs) + else: + # don't need to use _fit_with_groups here since none of these models support it + self.best_model.fit(*args, **kwargs) + return self + + @property + def best_model(self): + return self._best_model + + @property + def best_score(self): + return self._best_score + + def _extract_best_model(self): + if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, RandomizedSearchCV): + return self.searcher.best_estimator_ + else: + for known_type in self._model_mapping().keys(): + if isinstance(self.searcher, known_type): + model_type, opt_params, strip_params, fit_vars = self._model_mapping()[known_type] + model = model_type() + # set all shared parameters + for param in model.get_params().keys() & 
self.searcher.get_params().keys() - set(strip_params): + setattr(model, param, getattr(self.searcher, param)) + # update learned hyperparameters with best values + for param in opt_params: + setattr(model, param, getattr(self.searcher, param + "_")) + # set all fitted variables + for var in fit_vars: + setattr(model, var, getattr(self.searcher, var)) + return model + raise ValueError(f"Unsupported type: {type(self.searcher)}") + + +class ListSelector(SingleModelSelector): + """ + Model selection class that selects the best model from a list of model selectors + + Parameters + ---------- + models : list of ModelSelector + The list of model selectors to choose from + unwrap : bool, default True + Whether to return the best model's best model, rather than just the outer best model selector + """ + + def __init__(self, models, unwrap=True): + self.models = [clone(model, safe=False) for model in models] + self.unwrap = unwrap + + def train(self, is_selecting, *args, **kwargs): + if is_selecting: + scores = [] + for model in self.models: + model.train(is_selecting, *args, **kwargs) + scores.append(model.best_score) + + self._best_score = np.max(scores) + self._best_model = self.models[np.argmax(scores)] + + else: + self._best_model.train(is_selecting, *args, **kwargs) + + @property + def best_model(self): + """ + Gets the best model; note that if we were selecting over SingleModelSelectors and `unwrap` is `False`, + we will return the SingleModelSelector instance, not its best model. 
+ """ + return self._best_model.best_model if self.unwrap else self._best_model + + @property + def best_score(self): + return self._best_score + + +def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV): + named_models = { + 'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete + else WeightedLassoCVWrapper(random_state=random_state, cv=cv)), + 'forest': (RandomForestClassifier(random_state=random_state) if is_discrete + else RandomForestRegressor(random_state=random_state)), + } + if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything + return input + elif isinstance(input, list): # we've got a list; call get_selector on each element, then wrap in a ListSelector + models = [get_selector(model, is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + for model in input] + return ListSelector(models) + elif isinstance(input, str): # we've got a string; look it up + if input in named_models: + return get_selector(named_models[input], is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + else: + raise ValueError(f"Unknown model type: {input}, must be one of {named_models.keys()}") + elif SklearnCVSelector.can_wrap(input): + return SklearnCVSelector(input) + else: # assume this is an sklearn-compatible model + return FixedModelSelector(input) + + class SearchEstimatorList(BaseEstimator): """ The SearchEstimatorList is a utility class for hyperparameter tuning. - It provides a convenient way to perform GridSearch cross-validation for - a list of estimators. The class automates the process of hyperparameter + It provides a convenient way to perform GridSearch cross-validation for + a list of estimators. The class automates the process of hyperparameter tuning, model fitting, and prediction for multiple estimators. @@ -275,7 +568,8 @@ class SearchEstimatorList(BaseEstimator): A list of names of estimators to be used for grid search. 
param_grid_list : list or 'auto', default 'auto' - A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. If set to 'auto', the class automatically generates hyperparameters for the estimators. + A list of dictionaries specifying hyperparameters for each estimator in `estimator_list`. If set to 'auto', + the class automatically generates hyperparameters for the estimators. scaling : bool, default True Indicates whether to scale the input data using StandardScaler. @@ -304,32 +598,35 @@ class SearchEstimatorList(BaseEstimator): random_state : int, RandomState instance, or None, default None If int, `random_state` is the seed used by the random number generator; If `RandomState` instance, `random_state` is the random number generator; - If None, the random number generator is the `RandomState` instance used by `np.random`. Used when `shuffle` == True. + If None, the random number generator is the `RandomState` instance used by `np.random`. + Used when `shuffle` == True. error_score : float or 'raise', default np.nan - The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', an error is raised. + The value assigned to the score if an error occurs during fitting an estimator. If set to 'raise', + an error is raised. return_train_score : bool, default False Determines whether to include training scores in the `cv_results_` attribute of the class. 
categorical_indices : str, int, list, or None default None - List of categorical indices + List of categorical indices """ - def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, is_discrete=False, scoring=None, - n_jobs=None, refit=True, cv=2, verbose=2, pre_dispatch='2*n_jobs', random_state=None, + def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, scaling=False, + is_discrete=False, scoring=None, n_jobs=None, refit=True, cv=2, verbose=2, + pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False, categorical_indices=None): - # pdb.set_trace() self.estimator_list = estimator_list self.complete_estimator_list = get_complete_estimator_list( clone(estimator_list, safe=False), is_discrete=is_discrete, random_state=random_state) - # TODO Add in more functionality by checking if it's an empty list. If it's just 1 dictionary then we're going to need to turn it into a list + # TODO Add in more functionality by checking if it's an empty list. 
If it's just 1 dictionary + # then we're going to need to turn it into a list # Just do more cases if param_grid_list == 'auto': self.param_grid_list = auto_hyperparameters( estimator_list=self.complete_estimator_list, is_discrete=is_discrete) - elif (param_grid_list == None): + elif (param_grid_list is None): self.param_grid_list = len(self.complete_estimator_list) * [{}] else: if isinstance(param_grid_list, dict): @@ -338,7 +635,7 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc self.param_grid_list = param_grid_list self.categorical_indices = categorical_indices self.scoring = scoring - if scoring == None: + if scoring is None: if is_discrete: self.scoring = 'f1_macro' else: @@ -357,10 +654,6 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly'] def fit(self, X, y, *, sample_weight=None, groups=None): - # print(groups) - # if groups != None: - # pdb.set_trace() - # pdb.set_trace() self._search_list = [] # Change estimators if multi_task @@ -369,7 +662,7 @@ def fit(self, X, y, *, sample_weight=None, groups=None): if not can_handle_multitask(model=estimator, is_discrete=self.is_discrete): self.complete_estimator_list[index] = make_model_multi_task( model=estimator, is_discrete=self.is_discrete) - if self.param_grid_list != None: + if self.param_grid_list is not None: self.param_grid_list[index] = make_param_multi_task( estimator=estimator, param_grid=self.param_grid_list[index]) @@ -381,9 +674,10 @@ def fit(self, X, y, *, sample_weight=None, groups=None): if just_one_model_no_params(estimator_list=self.complete_estimator_list, param_list=self.param_grid_list): # Just fit the model and return it, no need for grid search or for loop estimator = self.complete_estimator_list[0] - if self.random_state != None: + if self.random_state is not None: if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the 
random state of the linear part, the polynomial part doesn't have random state + # For a polynomial pipeline, you have to set the random state of the linear part, + # the polynomial part doesn't have random state if is_polynomial_pipeline(estimator): estimator = estimator.set_params(linear__random_state=self.random_state) else: @@ -407,14 +701,15 @@ def fit(self, X, y, *, sample_weight=None, groups=None): else: print(f"Processing estimator: {type(estimator).__name__}") try: - if self.random_state != None: + if self.random_state is not None: if has_random_state(model=estimator): - # For a polynomial pipeline, you have to set the random state of the linear part, the polynomial part doesn't have random state + # For a polynomial pipeline, you have to set the random state of the linear part, + # the polynomial part doesn't have random state if is_polynomial_pipeline(estimator): estimator = estimator.set_params(linear__random_state=self.random_state) else: estimator.set_params(random_state=self.random_state) - # pdb.set_trace() # Note Delete this + temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring, n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, pre_dispatch=self.pre_dispatch, error_score=self.error_score, @@ -442,8 +737,10 @@ def fit(self, X, y, *, sample_weight=None, groups=None): warning_msg = f"Warning: {e} for estimator {estimator} and param_grid {param_grid}" warnings.warn(warning_msg, category=UserWarning) if not hasattr(temp_search, 'cv_results_') and not param_grid_is_empty(param_grid=param_grid): - # This warning catches a problem after fit has run with no exception, however if there is no cv_results_ this indicates a failed fit operation. - warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed has no attribute cv_results_." + # This warning catches a problem after fit has run with no exception, + # however if there is no cv_results_ this indicates a failed fit operation. 
+ warning_msg = (f"Warning: estimator {estimator} and param_grid {param_grid} " + "failed, has no attribute cv_results_.") warnings.warn(warning_msg, category=FitFailedWarning) try: self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list]) @@ -453,8 +750,8 @@ def fit(self, X, y, *, sample_weight=None, groups=None): self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_ self.best_score_ = self._search_list[self.best_ind_].best_score_ self.best_params_ = self._search_list[self.best_ind_].best_params_ - print( - f'Best estimator {self.best_estimator_} and best score {self.best_score_} and best params {self.best_params_}') + print(f'Best estimator {self.best_estimator_} and best score {self.best_score_} ' + f'and best params {self.best_params_}') return self def scaler_transform(self, X): @@ -496,20 +793,14 @@ class GridSearchCVList(BaseEstimator): of parameter settings. """ - def __init__(self, estimator_list=['linear', 'forest'], param_grid_list='auto', scoring=None, + def __init__(self, estimator_list, param_grid_list, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=False, is_discrete=False): - # 'discrete' if is_discrete else 'continuous' - self.estimator_list = get_complete_estimator_list(estimator_list, is_discrete, ) - if param_grid_list == 'auto': - self.param_grid_list = auto_hyperparameters(estimator_list=self.estimator_list, is_discrete=is_discrete) - elif (param_grid_list == None): - self.param_grid_list = len(self.estimator_list) * [{}] - else: - self.param_grid_list = param_grid_list + error_score=np.nan, return_train_score=False): + self.estimator_list = estimator_list + self.param_grid_list = param_grid_list self.scoring = scoring self.n_jobs = n_jobs - # self.refit = refit + self.refit = refit self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch @@ -519,7 +810,7 @@ def __init__(self, estimator_list=['linear', 'forest'], 
param_grid_list='auto', def fit(self, X, y=None, **fit_params): self._gcv_list = [GridSearchCV(estimator, param_grid, scoring=self.scoring, - n_jobs=self.n_jobs, cv=self.cv, verbose=self.verbose, + n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose, pre_dispatch=self.pre_dispatch, error_score=self.error_score, return_train_score=self.return_train_score) for estimator, param_grid in zip(self.estimator_list, self.param_grid_list)] @@ -529,9 +820,6 @@ def fit(self, X, y=None, **fit_params): self.best_params_ = self._gcv_list[self.best_ind_].best_params_ return self - def best_model(self): - return self.best_estimator_ - def predict(self, X): return self.best_estimator_.predict(X) @@ -539,7 +827,7 @@ def predict_proba(self, X): return self.best_estimator_.predict_proba(X) -def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=3, +def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict', safe=True): """This is a fork from :meth:`~sklearn.model_selection.cross_val_predict` to allow for diff --git a/econml/sklearn_extensions/model_selection_utils.py b/econml/sklearn_extensions/model_selection_utils.py index 477731600..ab3f567d8 100644 --- a/econml/sklearn_extensions/model_selection_utils.py +++ b/econml/sklearn_extensions/model_selection_utils.py @@ -1,5 +1,4 @@ -import pdb import warnings from sklearn.exceptions import NotFittedError import numpy as np @@ -104,7 +103,8 @@ def select_estimator(estimator_type, is_discrete, random_state): Parameters ---------- - estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'automl', 'all'. + estimator_type (str): The type of estimator to use, one of: 'linear', 'forest', + 'gbf', 'nnet', 'poly', 'automl', 'all'. is_discrete (bool): The type of target variable, if true then it's discrete. 
TODO Add Random State for parameter Returns @@ -156,7 +156,8 @@ def check_list_type(lst): bool: True if the list only contains valid objects, False otherwise. Raises: - TypeError: If the list contains objects other than strings, sklearn model objects, or sklearn model selection objects. + TypeError: If the list contains objects other than strings, sklearn model objects, + or sklearn model selection objects. Examples: >>> check_list_type(['linear', RandomForestRegressor(), KFold()]) @@ -167,13 +168,12 @@ def check_list_type(lst): if len(lst) == 0: raise ValueError("Estimator list is empty. Please add some models or use some of the defaults provided.") - # pdb.set_trace() for element in lst: if (not isinstance(element, (str, BaseCrossValidator))): if not is_likely_estimator(element): - # pdb.set_trace() raise TypeError( - f"The list must contain only strings, sklearn model objects, and sklearn model selection objects. Invalid element: {element}") + "The list must contain only strings, sklearn model objects, and sklearn model selection objects. " + f"Invalid element: {element}") return True @@ -183,7 +183,8 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): Parameters ---------- - estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', 'nnet', 'poly', 'auto', 'all'. + estimator_list : List of estimators; can be sklearn object or str: 'linear', 'forest', 'gbf', + 'nnet', 'poly', 'auto', 'all'. is_discrete (bool): if target type is discrete or continuous. Returns @@ -194,7 +195,6 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): ValueError: If the estimator is not supported. 
''' - # pdb.set_trace() if isinstance(estimator_list, str): if 'all' == estimator_list: estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'] @@ -204,7 +204,8 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): estimator_list = [estimator_list] else: raise ValueError( - "Invalid estimator_list value. Please provide a valid value from the list of available estimators: ['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") + "Invalid estimator_list value. Please provide a valid value from the list of available estimators: " + "['linear', 'forest', 'gbf', 'nnet', 'poly', 'automl']") elif isinstance(estimator_list, list): if 'auto' in estimator_list: for estimator in ['linear']: @@ -236,11 +237,10 @@ def get_complete_estimator_list(estimator_list, is_discrete, random_state): temp_est_list = flatten_list(temp_est_list) # Check that all types of models are matched towards the problem. - # pdb.set_trace() for estimator in temp_est_list: if (isinstance(estimator, BaseEstimator)): if not is_regressor_or_classifier(estimator, is_discrete=is_discrete): - raise TypeError("Invalid estimator type: {} - must be a regressor or classifier".format(type(estimator))) + raise TypeError(f"Invalid estimator type: {type(estimator)} - must be a regressor or classifier") return temp_est_list @@ -292,7 +292,9 @@ def select_classification_hyperparameters(estimator): 'linear__solver': ['saga', 'lbfgs'] } else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", category=UserWarning) + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " + "LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", + category=UserWarning) return {} # raise ValueError("Invalid model type. 
Valid values are 'linear', 'forest', 'nnet', and 'poly'.") @@ -340,7 +342,9 @@ def select_regression_hyperparameters(estimator): 'poly__degree': [2, 3, 4] } else: - warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", category=UserWarning) + warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for " + "ElasticNetCV, RandomForestRegressor, MLPRegressor, and the polynomial pipeline.", + category=UserWarning) return {} @@ -490,7 +494,8 @@ def is_linear_model(estimator): """ Check if a model is a linear model. - This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of LogisticRegression, LinearSVC, or SVC. + This function checks if a model has 'fit_intercept' and 'coef_' attributes or if it is an instance of + LogisticRegression, LinearSVC, or SVC. Parameters ---------- @@ -521,7 +526,8 @@ def is_data_scaled(X): """ Check if input data is scaled. - This function checks if the input data is scaled by comparing its mean and standard deviation to 0 and 1 respectively. + This function checks if the input data is scaled by comparing its mean and standard deviation to + 0 and 1 respectively. Parameters ---------- @@ -754,7 +760,8 @@ def make_param_multi_task(estimator, param_grid): """ Convert the keys in a parameter grid to work with a multi-task model. - This function converts the keys in a parameter grid to work with a multi-task model by prepending 'estimator__' to each key. + This function converts the keys in a parameter grid to work with a multi-task model by prepending + 'estimator__' to each key. 
Parameters ---------- diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index afb445ccd..2f321c5f2 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -22,9 +22,7 @@ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.multioutput import MultiOutputRegressor from econml.grf import MultiOutputGRF -from econml.sklearn_extensions.model_selection import SearchEstimatorList from econml.tests.utilities import (GroupingModel, NestedModel) -import pdb try: import ray @@ -625,9 +623,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, Pipeline) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, WeightedLasso) for mdl in est.models_t[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, LogisticRegression) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A', 'A^2']) np.testing.assert_array_equal(est.cate_feature_names(), ['X0', 'X0^2']) est = DML(model_y=WeightedLasso(), @@ -641,9 +639,9 @@ def test_access_to_internal_models(self): assert isinstance(est.featurizer_, FunctionTransformer) assert isinstance(est.model_cate, WeightedLasso) for mdl in est.models_y[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, WeightedLasso) for mdl in est.models_t[0]: - assert isinstance(mdl, SearchEstimatorList) + assert isinstance(mdl, LogisticRegression) np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A']) def test_forest_dml_perf(self): @@ -1131,7 +1129,6 @@ def _test_sparse(n_p, d_w, n_r): model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) dml.fit(y, t, X=x, W=w) - # pdb.set_trace() np.testing.assert_allclose(a, dml.coef_.reshape(-1), atol=1e-1) eff = reshape(t * np.choose(np.tile(p, 2), a), (-1,)) np.testing.assert_allclose(eff, dml.effect(x, T0=0, T1=t), atol=1e-1) @@ -1239,8 +1236,8 @@ def 
test_groups(self): # test outer grouping # with 2 folds, we should get exactly 3 groups per split, each with 10 copies of the y or t value - est = LinearDML(model_y=GroupingModel(LinearRegression(), (3, 3), n_copies), - model_t=GroupingModel(LinearRegression(), (3, 3), n_copies)) + est = LinearDML(model_y=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), + model_t=GroupingModel(LinearRegression(), 60, (3, 3), n_copies)) est.fit(y, t, groups=groups) # test nested grouping @@ -1248,17 +1245,10 @@ def test_groups(self): # with 2-fold outer and 2-fold inner grouping, and six total groups, # should get 1 or 2 groups per split - est = LinearDML(model_y=NestedModel(LassoCV(cv=2), (1, 2), n_copies), - model_t=NestedModel(LassoCV(cv=2), (1, 2), n_copies)) + est = LinearDML(model_y=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), + model_t=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies)) est.fit(y, t, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does this imply we should change some defaults to make this more likely to succeed? 
- est = LinearDML(model_y=LassoCV(cv=5), model_t=LassoCV(cv=5)) - with pytest.raises(Exception): - est.fit(y, t, groups=groups) - def test_treatment_names(self): Y = np.random.normal(size=(100, 1)) T = np.random.binomial(n=1, p=0.5, size=(100, 1)) diff --git a/econml/tests/test_dmliv.py b/econml/tests/test_dmliv.py index f52c14356..16f8f55a9 100644 --- a/econml/tests/test_dmliv.py +++ b/econml/tests/test_dmliv.py @@ -207,7 +207,7 @@ def test_groups(self): projection=False, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_z_xw=LogisticRegression(), ), @@ -215,7 +215,7 @@ def test_groups(self): projection=True, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -223,7 +223,7 @@ def test_groups(self): model_final=LinearRegression(fit_intercept=False), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -231,7 +231,7 @@ def test_groups(self): model_final=RandomForestRegressor(), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 39b90c1ed..38bb8421a 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -13,7 +13,7 @@ import pickle from scipy import special from sklearn.preprocessing import 
PolynomialFeatures -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression import unittest try: @@ -281,7 +281,10 @@ def test_accuracy_without_ray(self): def test_fit_cov_directly(self): # fitting the covariance directly should be at least as good as computing the covariance from separate models - est = LinearDRIV() + + # set the models so that model selection over random forests doesn't take too much time in the repeated trials + est = LinearDRIV(model_y_xw=LassoCV(), model_t_xw=LassoCV(), model_z_xw=LassoCV(), + model_tz_xw=LassoCV()) n = 500 p = 10 @@ -334,8 +337,8 @@ def ceil(a, b): # ceiling analog of // DRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -344,8 +347,8 @@ def ceil(a, b): # ceiling analog of // LinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -354,8 +357,8 @@ def ceil(a, b): # ceiling analog of // SparseLinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -364,20 +367,20 @@ def ceil(a, b): # ceiling analog 
of // ForestDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), IntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), LinearIntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ) diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index f6a5e4ae8..3d3e982a9 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -828,26 +828,17 @@ def test_groups(self): # cross-fit generate one est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold grouping, we should get exactly 3 groups per split - model_regression=GroupingModel(LinearRegression(), (3, 3), n_copies), + model_regression=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) # test nested grouping est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold outer and 2-fold inner grouping, we should get 1-2 groups per split - model_regression=NestedModel(LassoCV(cv=2), (1, 2), n_copies), + model_regression=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does 
this imply we should change some defaults to make this more likely to succeed? - est = LinearDRLearner(model_propensity=LogisticRegressionCV(cv=5), - model_regression=LassoCV(cv=5), - cv=StratifiedGroupKFold(2)) - with pytest.raises(Exception): - est.fit(y, t, W=w, groups=groups) - def test_score(self): """Test that scores are the same no matter whether the prediction of cate model has the same shape of input or the shape of input.reshape(-1,1).""" diff --git a/econml/tests/test_missing_values.py b/econml/tests/test_missing_values.py index 66d917f76..eb1c4f7e4 100644 --- a/econml/tests/test_missing_values.py +++ b/econml/tests/test_missing_values.py @@ -27,7 +27,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 66c389ae0..b22a9dbcc 100644 --- a/econml/tests/test_ortho_learner.py +++ b/econml/tests/test_ortho_learner.py @@ -29,7 +29,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, Q, W=None): + def train(self, is_selecting, X, y, Q, W=None): self._model.fit(X, y) return self @@ -109,7 +109,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self @@ -219,7 +219,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -331,7 +331,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -378,7 +378,7 
@@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -434,7 +434,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1] + 1))) self._model_y.fit(W, Y) return self diff --git a/econml/tests/utilities.py b/econml/tests/utilities.py index 4c04cc89d..1c11be343 100644 --- a/econml/tests/utilities.py +++ b/econml/tests/utilities.py @@ -16,15 +16,17 @@ class GroupingModel: and the number of copies of each y value should be equal to the group size """ - def __init__(self, model, limits, n_copies): + def __init__(self, model, total, limits, n_copies): self.model = model + self.total = total self.limits = limits self.n_copies = n_copies - def validate(self, y): + def validate(self, y, skip_group_counts=False): (yvals, cts) = np.unique(y, return_counts=True) (llim, ulim) = self.limits - if not (llim <= len(yvals) <= ulim): + # if we aren't fitting on the whole dataset, ensure that the limits are respected + if (not skip_group_counts) and (not (llim <= len(yvals) <= ulim)): raise Exception(f"Grouping failed: received {len(yvals)} groups instead of {llim}-{ulim}") # ensure that the grouping has worked correctly and we get exactly the number of copies @@ -35,7 +37,7 @@ def validate(self, y): f"Grouping failed; received {ct} copies of {yval} instead of {self.n_copies[yval]}") def fit(self, X, y): - self.validate(y) + self.validate(y, len(y) == self.total) self.model.fit(X, y) return self @@ -46,12 +48,9 @@ def predict(self, X): class NestedModel(GroupingModel): """ Class for testing nested grouping. 
The wrapped model must have a 'cv' attribute; - this class exposes an identical 'cv' attribute, which is how nested CV is implemented in fit_with_groups + this class exposes an identical 'cv' attribute, which is how nested CV is implemented in _fit_with_groups """ - def __init__(self, model, limits, n_copies): - super().__init__(model, limits, n_copies) - # DML nested CV works via a 'cv' attribute @property def cv(self): @@ -64,6 +63,6 @@ def cv(self, value): def fit(self, X, y): for (train, test) in check_cv(self.cv, y).split(X, y): # want to validate the nested grouping, not the outer grouping in the nesting tests - self.validate(y[train]) + self.validate(y[train], len(y) == self.total) self.model.fit(X, y) return self diff --git a/econml/utilities.py b/econml/utilities.py index 008bfc244..f62ffbb4d 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -21,7 +21,6 @@ from sklearn.preprocessing import PolynomialFeatures import warnings from warnings import warn -from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold from collections.abc import Iterable from sklearn.utils.multiclass import type_of_target import numbers @@ -30,7 +29,6 @@ from statsmodels.compat.python import lmap import copy from inspect import signature -from econml.sklearn_extensions.model_selection import SearchEstimatorList MAX_RAND_SEED = np.iinfo(np.int32).max @@ -920,78 +918,6 @@ def filter_inds(coords, data, n): [arrs[indMap[c][0][0]].shape[indMap[c][0][1]] for c in outputs]) -def fit_with_groups(model, X, y, groups=None, **kwargs): - """ - Fit a model while correctly handling grouping if necessary. - - This enables us to perform an inner-loop cross-validation of a model - which handles grouping correctly, which is not easy using typical sklearn models. 
- - For example, GridSearchCV and RandomSearchCV both support passing 'groups' to fit, - but other CV-related estimators (such as those derived from LinearModelCV, including LassoCV), - do not support passing groups to fit which meanst that GroupKFold cannot be used as the cv instance - when using these types, because the required 'groups' argument will never be passed to the - GroupKFold's split method. See also https://github.com/scikit-learn/scikit-learn/issues/12052 - - The (hacky) workaround that is used here is to explicitly set the 'cv' attribute (if there is one) to - the exact set of rows and not to use GroupKFold even with the sklearn classes that could support it; - this should work with classes derived from BaseSearchCV, LinearModelCV, and CalibratedClassifierCV. - - Parameters - ---------- - model : estimator - The model to fit - X : array_like - The features to fit against - y : array_like - The target to fit against - groups : array_like, optional - The set of groupings that should be kept together when splitting rows for - cross-validation - kwargs : dict - Any other named arguments to pass to the model's fit - """ - # import pdb - # pdb.set_trace() - if groups is not None: - if isinstance(model, SearchEstimatorList): - # SearchEstimatorList must be handled different. Each estimator must be changed for CV else the functionality isn't the same - # It does have a CV but it does not work if you just change the CV of the SearchEstimatorList - for estimator in model.complete_estimator_list: - if hasattr(estimator, 'cv'): - old_cv = estimator.cv - cv = 5 if old_cv is None else old_cv - if isinstance(cv, numbers.Integral): - cv = GroupKFold(cv) - splits = list(cv.split(X, y, groups=groups)) - try: - estimator.cv = splits - except: - estimator.cv = old_cv - # assume that we should perform nested cross-validation if and only if - # the model has a 'cv' attribute; this is a somewhat brittle assumption... 
- elif hasattr(model, 'cv'): - old_cv = model.cv - # logic copied from check_cv - cv = 5 if old_cv is None else old_cv - if isinstance(cv, numbers.Integral): - cv = GroupKFold(cv) - # otherwise we will assume the user already set the cv attribute to something - # compatible with splitting with a 'groups' argument - - # now we have to compute the folds explicitly because some classifiers (like LassoCV) - # don't use the groups when calling split internally - splits = list(cv.split(X, y, groups=groups)) - try: - print(splits) - model.cv = splits - return model.fit(X, y, **kwargs) - finally: - model.cv = old_cv - - return model.fit(X, y, **kwargs) - - def filter_none_kwargs(**kwargs): """ Filters out any keyword arguments that are None. diff --git a/notebooks/SearchEstimatorList functionality.ipynb b/notebooks/SearchEstimatorList functionality.ipynb deleted file mode 100644 index 4464199de..000000000 --- a/notebooks/SearchEstimatorList functionality.ipynb +++ /dev/null @@ -1,1031 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Import necessary packages\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import mean_squared_error, accuracy_score\n", - "from sklearn.datasets import load_iris\n", - "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", - "import warnings\n", - "import numpy as np\n", - "from econml.dml import LinearDML, CausalForestDML\n", - "from econml.cate_interpreter import SingleTreeCateInterpreter, SingleTreePolicyInterpreter\n", - "import pandas as pd\n", - "from sklearn.preprocessing import PolynomialFeatures\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.exceptions import ConvergenceWarning\n", - "\n", - "# Ignore the ConvergenceWarning\n", - "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "# SearchEstimatorList\n", - "\n", - "The SearchEstimatorList class is a custom Python class designed to streamline the process of training multiple machine learning models and tuning their hyperparameters. This class can be especially useful when you're unsure which model will perform best on your data and you want to compare several of them.\n", - "\n", - "# Key Features\n", - "\n", - " Multiple Model Training: The SearchEstimatorList class takes a list of Scikit-learn estimators (machine learning models) and trains each of them on your data.\n", - "\n", - " Hyperparameter Tuning: For each model, the class conducts a grid search over a provided range of hyperparameters. This allows you to automatically find the hyperparameters that result in the best model performance.\n", - "\n", - " Model Evaluation: The class retains the best performing model based on a specified scoring metric. This makes it easy to determine which model and hyperparameters are the most suitable for your data.\n", - "\n", - " Data Scaling: The SearchEstimatorList class also supports data scaling, which can be important for certain types of models.\n", - "\n", - " Handling of Different Target Types: This class handles both continuous and discrete target variables, making it suitable for both regression and classification tasks.\n", - "\n", - "# Usage\n", - "\n", - "To use the SearchEstimatorList class, you start by initializing an instance of the class with a list of models and their corresponding hyperparameter grids. Then, you call the fit method to train the models and conduct the grid search. After fitting, you can use the predict method to generate predictions for new data. The class also has methods to refit the best model using the entire dataset (refit) and to return the best model (best_model)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method f1_macro.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", - "[CV 1/2] END ...................n_estimators=50;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ...................n_estimators=50;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END ..................n_estimators=100;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ..................n_estimators=100;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END ..................n_estimators=150;, score=0.916 total time= 0.1s\n", - "[CV 2/2] END ..................n_estimators=150;, score=0.950 total time= 0.1s\n", - "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=50;, score=0.950 total time= 0.0s\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=100;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=100;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.01, n_estimators=150;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END learning_rate=0.01, n_estimators=150;, score=0.950 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END learning_rate=0.1, n_estimators=50;, score=0.950 total time= 0.0s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=100;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END learning_rate=0.1, n_estimators=100;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END learning_rate=0.1, n_estimators=150;, score=0.900 total time= 0.1s\n", - 
"[CV 2/2] END learning_rate=0.1, n_estimators=150;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END ..learning_rate=1, n_estimators=50;, score=0.900 total time= 0.0s\n", - "[CV 2/2] END ..learning_rate=1, n_estimators=50;, score=0.933 total time= 0.0s\n", - "[CV 1/2] END .learning_rate=1, n_estimators=100;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END .learning_rate=1, n_estimators=100;, score=0.933 total time= 0.1s\n", - "[CV 1/2] END .learning_rate=1, n_estimators=150;, score=0.900 total time= 0.1s\n", - "[CV 2/2] END .learning_rate=1, n_estimators=150;, score=0.933 total time= 0.1s\n", - "Best estimator RandomForestClassifier(n_estimators=50) and best score 0.9330819977445048 and best params {'n_estimators': 50}\n", - "Accuracy: 1.0\n" - ] - } - ], - "source": [ - "# Load the Iris dataset for classification\n", - "iris = load_iris()\n", - "\n", - "# Split the dataset into training and test sets\n", - "X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(\n", - " iris.data, iris.target, test_size=0.2, random_state=42\n", - ")\n", - "\n", - "# Define models and their parameter grids\n", - "estimator_list_cls = ['forest', 'gbf']\n", - "param_grid_list_cls = [{'n_estimators': [50, 100, 150]}, {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1]}]\n", - "\n", - "# Initialize SearchEstimatorList\n", - "sel_cls = SearchEstimatorList(\n", - " estimator_list=estimator_list_cls, \n", - " param_grid_list=param_grid_list_cls, \n", - " is_discrete=True,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_cls.fit(X_train_cls, y_train_cls)\n", - "\n", - "# Predict outcomes for the test set\n", - "predictions_cls = sel_cls.predict(X_test_cls)\n", - "\n", - "# Evaluate the model\n", - "acc = accuracy_score(y_test_cls, predictions_cls)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Accuracy: {acc}\")\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Regressor" 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 7 candidates, totalling 14 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method neg_mean_squared_error.\n", - " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV 1/2] END .....................l1_ratio=0.1;, score=-0.584 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.1;, score=-0.725 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.5;, score=-0.549 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.5;, score=-0.675 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.7;, score=-0.546 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.7;, score=-0.668 total time= 0.0s\n", - "[CV 1/2] END .....................l1_ratio=0.9;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END .....................l1_ratio=0.9;, score=-0.663 total time= 0.0s\n", - "[CV 1/2] END ....................l1_ratio=0.95;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END ....................l1_ratio=0.95;, score=-0.662 total time= 0.0s\n", - "[CV 1/2] END ....................l1_ratio=0.99;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END ....................l1_ratio=0.99;, score=-0.661 total time= 0.0s\n", - "[CV 1/2] END .......................l1_ratio=1;, score=-0.544 total time= 0.0s\n", - "[CV 2/2] END .......................l1_ratio=1;, score=-0.661 total time= 0.0s\n", - "Fitting 2 folds for each of 3 candidates, totalling 6 fits\n", - "[CV 1/2] END ............hidden_layer_sizes=50;, score=-0.712 
total time= 1.0s\n", - "[CV 2/2] END ............hidden_layer_sizes=50;, score=-0.580 total time= 1.3s\n", - "[CV 1/2] END ...........hidden_layer_sizes=100;, score=-0.695 total time= 0.8s\n", - "[CV 2/2] END ...........hidden_layer_sizes=100;, score=-2.334 total time= 1.0s\n", - "[CV 1/2] END ...........hidden_layer_sizes=200;, score=-0.641 total time= 8.1s\n", - "[CV 2/2] END ...........hidden_layer_sizes=200;, score=-1.162 total time= 5.4s\n", - "Best estimator ElasticNetCV(l1_ratio=1) and best score -0.6025662427788023 and best params {'l1_ratio': 1}\n", - "Mean Squared Error: 0.5555752649052167\n" - ] - } - ], - "source": [ - "# Import necessary packages\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import mean_squared_error, accuracy_score\n", - "from sklearn.datasets import fetch_california_housing\n", - "from econml.sklearn_extensions.model_selection import SearchEstimatorList\n", - "\n", - "# Load the California Housing dataset for regression\n", - "california_housing = fetch_california_housing()\n", - "\n", - "# Split the dataset into training and test sets\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n", - " california_housing.data, california_housing.target, test_size=0.2, random_state=42\n", - ")\n", - "\n", - "# Define models and their parameter grids\n", - "# 'linear' uses an ElasticNet linear model and 'nnet' uses a neural network regressor\n", - "estimator_list_reg = ['linear', 'nnet']\n", - "param_grid_list_reg = [{'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]}, {'hidden_layer_sizes': [50, 100, 200]}]\n", - "\n", - "# Initialize SearchEstimatorList\n", - "sel_reg = SearchEstimatorList(\n", - " estimator_list=estimator_list_reg, \n", - " param_grid_list=param_grid_list_reg,\n", - " is_discrete=False,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - "# Predict outcomes for the test set\n", -
"predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - "# Evaluate the model\n", - "mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Mean Squared Error: {mse}\")\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using all estimators" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/anthonycampbell/Documents/EconML-CS696DS/econml/sklearn_extensions/model_selection.py:346: UserWarning: No scoring value was given. Using default score method f1_macro.\n", - " warnings.warn(f\"No scoring value was given. Using default score method {self.scoring}.\")\n" - ] - } - ], - "source": [ - "search = SearchEstimatorList(estimator_list = ['linear', 'forest', 'gbf', 'nnet', 'poly'], is_discrete=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single Estimators and Model Objects" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag') and best score 0.966624895572264 and best params {'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", - "LogisticRegression(C=0.001, max_iter=50, penalty='none', solver='sag')\n", - "{'C': 0.001, 'max_iter': 50, 'penalty': 'none', 'solver': 'sag'}\n", - "mse of test dataset: 0.0\n", - "[[7.30818687e-04 9.18278306e-01 8.09908750e-02]\n", - " [9.96517769e-01 3.48223146e-03 9.52705844e-13]\n", - " [8.11833119e-11 2.27064968e-04 9.99772935e-01]\n", - " [1.49082115e-03 8.82474441e-01 1.16034738e-01]\n", - " [6.61814371e-04 9.57060549e-01 4.22776371e-02]\n", - " [9.94291457e-01 5.70854348e-03 8.51181731e-12]\n", - " [3.09570872e-02 9.66175329e-01 2.86758338e-03]\n", - " [1.03620286e-04 
2.72711857e-01 7.27184523e-01]\n", - " [1.86273814e-04 5.89659675e-01 4.10154051e-01]\n", - " [7.89829063e-03 9.84383361e-01 7.71834853e-03]\n", - " [1.79967697e-04 3.80342060e-01 6.19477972e-01]\n", - " [9.87625715e-01 1.23742845e-02 6.37903013e-11]\n", - " [9.97989545e-01 2.01045508e-03 2.71212460e-13]\n", - " [9.87073806e-01 1.29261936e-02 5.68033322e-11]\n", - " [9.97732149e-01 2.26785067e-03 1.43489213e-12]\n", - " [2.40047637e-03 9.42313621e-01 5.52859030e-02]\n", - " [1.40979957e-07 5.60447914e-03 9.94395380e-01]\n", - " [4.57991768e-03 9.78714479e-01 1.67056034e-02]\n", - " [1.07687184e-03 8.47974601e-01 1.50948527e-01]\n", - " [1.55738075e-07 5.44482660e-03 9.94555018e-01]\n", - " [9.84143440e-01 1.58565593e-02 2.21243624e-10]\n", - " [1.96353775e-04 3.77725182e-01 6.22078464e-01]\n", - " [9.90664487e-01 9.33551321e-03 6.98033897e-11]\n", - " [2.52736850e-07 8.46501225e-03 9.91534735e-01]\n", - " [1.95677109e-05 4.08891407e-01 5.91089025e-01]\n", - " [1.72461836e-05 8.83781623e-02 9.11604592e-01]\n", - " [1.09118029e-07 1.18285926e-02 9.88171298e-01]\n", - " [3.31801168e-07 1.03342423e-02 9.89665426e-01]\n", - " [9.86532115e-01 1.34678849e-02 1.68835118e-10]\n", - " [9.80493031e-01 1.95069688e-02 2.80655184e-10]]\n" - ] - } - ], - "source": [ - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - "\n", - " from sklearn.linear_model import LogisticRegression\n", - " lr_param_grid = {\n", - " 'penalty': ['l1', 'l2', 'elasticnet', 'none'],\n", - " 'C': [0.001, 0.01, 0.1, 1, 10, 100],\n", - " 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],\n", - " 'max_iter': [50, 100, 200, 500],\n", - " }\n", - "\n", - " search = SearchEstimatorList(estimator_list = LogisticRegression(), param_grid_list= lr_param_grid, verbose=0, is_discrete=True)\n", - " search.fit(X_train_cls, y_train_cls)\n", - " print(search.best_model())\n", - " print(search.best_params_)\n", - " y_pred = search.predict(X_test_cls)\n", - "\n", - " mse = 
mean_squared_error(y_test_cls, y_pred)\n", - "\n", - "print(\"mse of test dataset:\", mse,)\n", - "print(search.predict_proba(X_test_cls))\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Polynomial Feature\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 9 candidates, totalling 18 fits\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.322 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=3;, score=0.014 total time= 0.3s\n", - "[CV 1/2] END linear__l1_ratio=0.1, poly__degree=4;, score=0.000 total time= 1.0s\n", - "[CV 2/2] END linear__l1_ratio=0.1, poly__degree=4;, score=-0.000 total time= 1.1s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.322 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=3;, score=0.014 total time= 0.4s\n", - "[CV 1/2] END linear__l1_ratio=0.5, poly__degree=4;, score=0.000 total time= 1.5s\n", - "[CV 2/2] END linear__l1_ratio=0.5, poly__degree=4;, score=-0.000 total time= 1.3s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.322 total time= 0.2s\n", - "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=2;, score=0.287 total time= 0.2s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.000 total time= 0.3s\n", - "[CV 2/2] END linear__l1_ratio=0.9, poly__degree=3;, score=0.014 total time= 0.4s\n", - "[CV 1/2] END linear__l1_ratio=0.9, poly__degree=4;, score=0.000 total time= 1.1s\n", - "[CV 2/2] END 
linear__l1_ratio=0.9, poly__degree=4;, score=-0.000 total time= 1.1s\n", - "Best estimator Pipeline(steps=[('poly', PolynomialFeatures()),\n", - " ('linear', ElasticNetCV(l1_ratio=0.9))]) and best score 0.30443941337924607 and best params {'linear__l1_ratio': 0.9, 'poly__degree': 2}\n", - "Mean Squared Error: 0.8894038237145269\n" - ] - } - ], - "source": [ - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - " # For polynomial, please ensure that you have \"poly__\" (two \"_\" or underscores after poly) underneath to change degree\n", - " # To change the linear method please add \"linear__\" (two \"_\" or underscores after linear)\n", - " param_grid_list_poly = {'poly__degree': [2, 3, 4], 'linear__l1_ratio': [0.1, 0.5, 0.9]}\n", - " sel_reg = SearchEstimatorList(\n", - " estimator_list='poly', \n", - " param_grid_list=param_grid_list_poly,\n", - " is_discrete=False,\n", - " scoring='explained_variance',\n", - " verbose=3\n", - " )\n", - "\n", - " # Fit the model to the training data\n", - " sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - " # Predict outcomes for the test set\n", - " predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - " # Evaluate the model\n", - " mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - " # Print the evaluation metric\n", - " print(f\"Mean Squared Error: {mse}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['linear', 'forest', 'gbf', 'nnet', 'poly']\n" - ] - } - ], - "source": [ - "# These are all of the supported models that we have that have built in hyper parameters already included\n", - "print(sel_reg.supported_models)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END 
.................................., score=-0.518 total time= 0.1s\n", - "[CV 2/2] END .................................., score=-0.552 total time= 0.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.287 total time= 1.3s\n", - "[CV 2/2] END .................................., score=-0.293 total time= 1.3s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.286 total time= 3.1s\n", - "[CV 2/2] END .................................., score=-0.274 total time= 3.1s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.305 total time= 3.2s\n", - "[CV 2/2] END .................................., score=-0.305 total time= 3.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.526 total time= 0.6s\n", - "[CV 2/2] END ................................., score=-12.077 total time= 0.5s\n", - "Best estimator RandomForestRegressor() and best score -0.27976201134927425 and best params {}\n", - "Mean Squared Error: 0.2508316133481009\n" - ] - } - ], - "source": [ - "# To try every type of model simply use the \"all\" option\n", - "with warnings.catch_warnings():\n", - " warnings.simplefilter(\"ignore\")\n", - " sel_reg = SearchEstimatorList(\n", - " estimator_list='all', \n", - " param_grid_list=None,\n", - " is_discrete=False,\n", - " scaling=True,\n", - " verbose=5\n", - " )\n", - "\n", - " # Fit the model to the training data\n", - " sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - " # Predict outcomes for the test set\n", - " predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - " # Evaluate the model\n", - " mse = mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - " # Print the evaluation metric\n", - " print(f\"Mean Squared Error: {mse}\")" - 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scoring functions\n", - "\n", - "Using a custom scoring function. See https://scikit-learn.org/stable/modules/model_evaluation.html for how to make your own scoring metric\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-0.741 total time= 0.0s\n", - "[CV 2/2] END .................................., score=-0.822 total time= 0.0s\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV 1/2] END .................................., score=-2.404 total time= 0.8s\n", - "[CV 2/2] END .................................., score=-1.671 total time= 0.8s\n", - "Best estimator ElasticNetCV() and best score -0.7813657065847333 and best params {}\n", - "Root Mean Squared Error: 0.7490149943228499\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.metrics import make_scorer\n", - "\n", - "def root_mean_squared_error(y_true, y_pred):\n", - " mse = mean_squared_error(y_true, y_pred)\n", - " rmse = np.sqrt(mse)\n", - " return rmse\n", - "loss_function = make_scorer(root_mean_squared_error, greater_is_better=False)\n", - "\n", - "sel_reg = SearchEstimatorList(\n", - " estimator_list=estimator_list_reg, \n", - " param_grid_list=None,\n", - " is_discrete=False,\n", - " scoring=loss_function,\n", - " verbose=3\n", - ")\n", - "\n", - "# Fit the model to the training data\n", - "sel_reg.fit(X_train_reg, y_train_reg)\n", - "\n", - "# Predict outcomes for the test set\n", - "predictions_reg = sel_reg.predict(X_test_reg)\n", - "\n", - "# Evaluate the model\n", - "rmse = root_mean_squared_error(y_test_reg, predictions_reg)\n", - "\n", - "# Print the evaluation metric\n", - "print(f\"Root Mean 
Squared Error: {rmse}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What does this mean for EconML?\n", - "\n", - "By integrating the SearchEstimatorList into econml, we can gain a number of benefits in these categories:\n", - "\n", - " Model Selection: econml contains many different models, each with its own assumptions and use cases. By using SearchEstimatorList, you can more easily compare the performance of different models on your data and select the best one.\n", - "\n", - " Hyperparameter Tuning: Many of the models in econml have hyperparameters that need to be tuned for optimal performance. SearchEstimatorList can automate this process by performing a grid search over specified hyperparameters for each model.\n", - "\n", - " Efficiency: Instead of having to manually train each model and tune its hyperparameters, SearchEstimatorList can do this all at once. This can save a significant amount of time and make the model building process more efficient.\n", - "\n", - "See the example below, with data taken from the Customer Segmentation at an Online Media Company notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method neg_mean_squared_error.\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - "A column-vector y was passed when a 1d array was expected.
Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "*** Causal Estimate ***\n", - "\n", - "## Identified estimand\n", - "Estimand type: nonparametric-ate\n", - "\n", - "### Estimand : 1\n", - "Estimand name: backdoor\n", - "Estimand expression:\n", - " d \n", - "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", - "d[log_price] \n", - "\n", - " \n", - "_purchased,has_membership,is_US,account_age])\n", - " \n", - "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", - "\n", - "## Realized estimand\n", - "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", - "Target units: ate\n", - "\n", - "## Estimate\n", - "Mean value: 2.6518132830256684\n", - "Effect estimates: [ 2.57968831 -0.23224908 4.35502223 ... 
0.85234463 -3.53167996\n", - " 6.99294565]\n", - "\n" - ] - } - ], - "source": [ - "# Import the sample pricing data\n", - "file_url = \"https://msalicedatapublic.z5.web.core.windows.net/datasets/Pricing/pricing_sample.csv\"\n", - "train_data = pd.read_csv(file_url)\n", - "\n", - "# Data sample\n", - "train_data.head()\n", - "\n", - "# Define estimator inputs\n", - "train_data[\"log_demand\"] = np.log(train_data[\"demand\"])\n", - "train_data[\"log_price\"] = np.log(train_data[\"price\"])\n", - "\n", - "Y = train_data[\"log_demand\"].values\n", - "T = train_data[\"log_price\"].values\n", - "X = train_data[[\"income\"]].values # features\n", - "confounder_names = [\"account_age\", \"age\", \"avg_hours\", \"days_visited\", \"friends_count\", \"has_membership\", \"is_US\", \"songs_purchased\"]\n", - "W = train_data[confounder_names].values\n", - "\n", - "# Get test data\n", - "X_test = np.linspace(0, 5, 100).reshape(-1, 1)\n", - "X_test_data = pd.DataFrame(X_test, columns=[\"income\"])\n", - "\n", - "# initiate an EconML cate estimator\n", - "est = LinearDML(model_y='gbf', model_t='gbf',\n", - " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", - "\n", - "# fit through dowhy\n", - "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", - " confounder_names=confounder_names, inference=\"statsmodels\")\n", - "\n", - "lineardml_estimate = est_dw.estimate_\n", - "print(lineardml_estimate)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Define underlying treatment effect function given DGP\n", - "def gamma_fn(X):\n", - " return -3 - 14 * (X[\"income\"] < 1)\n", - "\n", - "def beta_fn(X):\n", - " return 20 + 0.5 * (X[\"avg_hours\"]) + 5 * (X[\"days_visited\"] > 4)\n", - "\n", - "def demand_fn(data, T):\n", - " Y = gamma_fn(data) * T + beta_fn(data)\n", - " return Y\n", - "\n", - "def true_te(x, n, stats):\n", - " if x < 1:\n", - " subdata = train_data[train_data[\"income\"] < 1].sample(n=n, replace=True)\n", - " else:\n", - " subdata = train_data[train_data[\"income\"] >= 1].sample(n=n, replace=True)\n", - " te_array = subdata[\"price\"] * gamma_fn(subdata) / (subdata[\"demand\"])\n", - " if stats == \"mean\":\n", - " return np.mean(te_array)\n", - " elif stats == \"median\":\n", - " return np.median(te_array)\n", - " elif isinstance(stats, int):\n", - " return np.percentile(te_array, stats)\n", - "\n", - "# Get the estimate and range of true treatment effect\n", - "truth_te_estimate = np.apply_along_axis(true_te, 1, X_test, 1000, \"mean\") # estimate\n", - "truth_te_upper = np.apply_along_axis(true_te, 1, X_test, 1000, 95) # upper level\n", - "truth_te_lower = np.apply_along_axis(true_te, 1, X_test, 1000, 5) # lower level\n", - "\n", - "te_pred = est_dw.effect(X_test).flatten()\n", - "te_pred_interval = est_dw.effect_interval(X_test)\n", - "\n", - "# Compare the estimate and the truth\n", - "plt.figure(figsize=(10, 6))\n", - "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", - "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " te_pred_interval[0].flatten(),\n", - " te_pred_interval[1].flatten(),\n", - " alpha=0.2,\n", - " label=\"95% Confidence Interval\",\n", - ")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " truth_te_lower,\n", - " truth_te_upper,\n", - " alpha=0.2,\n", - " 
label=\"True Elasticity Range\",\n", - ")\n", - "plt.xlabel(\"Income\")\n", - "plt.ylabel(\"Songs Sales Elasticity\")\n", - "plt.title(\"Songs Sales Elasticity vs Income\")\n", - "plt.legend(loc=\"lower right\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No scoring value was given. Using default score method neg_mean_squared_error.\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 1.1s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... 
total time= 0.3s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.4s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator RandomForestRegressor() and best score -0.007087413279468611 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 2.3s\n", - "[CV] END .................................................... total time= 2.3s\n", - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 12.6s\n", - "[CV] END .................................................... total time= 10.5s\n", - "Best estimator RandomForestRegressor() and best score -0.015753967716546576 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... 
total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 0.2s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV] END .................................................... total time= 0.3s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best estimator RandomForestRegressor() and best score -0.006845612318994855 and best params {}\n", - "Processing estimator: RandomForestRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... 
total time= 2.2s\n", - "[CV] END .................................................... total time= 2.1s\n", - "Processing estimator: MLPRegressor\n", - "Fitting 2 folds for each of 1 candidates, totalling 2 fits\n", - "[CV] END .................................................... total time= 12.2s\n", - "[CV] END .................................................... total time= 14.3s\n", - "Best estimator RandomForestRegressor() and best score -0.014455828883075759 and best params {}\n", - "*** Causal Estimate ***\n", - "\n", - "## Identified estimand\n", - "Estimand type: nonparametric-ate\n", - "\n", - "### Estimand : 1\n", - "Estimand name: backdoor\n", - "Estimand expression:\n", - " d \n", - "────────────(E[log_demand|income,friends_count,days_⟨visited,⟩_hours,age,songs\n", - "d[log_price] \n", - "\n", - " \n", - "_purchased,has_membership,is_US,account_age])\n", - " \n", - "Estimand assumption 1, Unconfoundedness: If U→{log_price} and U→log_demand then P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age,U) = P(log_demand|log_price,income,friends_count,days_visited,avg_hours,age,songs_purchased,has_membership,is_US,account_age)\n", - "\n", - "## Realized estimand\n", - "b: log_demand~log_price+income+friends_count+days_visited+avg_hours+age+songs_purchased+has_membership+is_US+account_age | income\n", - "Target units: ate\n", - "\n", - "## Estimate\n", - "Mean value: -0.9764341213588181\n", - "Effect estimates: [-1.06939218 -1.44817143 -0.81689907 ... 
-1.30445479 -1.87209822\n", - " -0.40427838]\n", - "\n" - ] - } - ], - "source": [ - "# initiate an EconML cate estimator\n", - "\n", - "est = LinearDML(model_y=['forest', 'nnet'], model_t=['nnet', 'forest'], scaling=False,\n", - " featurizer=PolynomialFeatures(degree=2, include_bias=False))\n", - "\n", - "# fit through dowhy\n", - "est_dw = est.dowhy.fit(Y, T, X=X, W=W, outcome_names=[\"log_demand\"], treatment_names=[\"log_price\"], feature_names=[\"income\"],\n", - " confounder_names=confounder_names, inference=\"statsmodels\")\n", - "\n", - "lineardml_estimate = est_dw.estimate_\n", - "print(lineardml_estimate)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "te_pred = est_dw.effect(X_test).flatten()\n", - "te_pred_interval = est_dw.effect_interval(X_test)\n", - "\n", - "# Compare the estimate and the truth\n", - "plt.figure(figsize=(10, 6))\n", - "plt.plot(X_test.flatten(), te_pred, label=\"Sales Elasticity Prediction\")\n", - "plt.plot(X_test.flatten(), truth_te_estimate, \"--\", label=\"True Elasticity\")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " te_pred_interval[0].flatten(),\n", - " te_pred_interval[1].flatten(),\n", - " alpha=0.2,\n", - " label=\"95% Confidence Interval\",\n", - ")\n", - "plt.fill_between(\n", - " X_test.flatten(),\n", - " truth_te_lower,\n", - " truth_te_upper,\n", - " alpha=0.2,\n", - " label=\"True Elasticity Range\",\n", - ")\n", - "plt.xlabel(\"Income\")\n", - "plt.ylabel(\"Songs Sales Elasticity\")\n", - "plt.title(\"Songs Sales Elasticity vs Income\")\n", - "plt.legend(loc=\"lower right\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}