From e335d152f06780affd1a070621141d84f115ffd9 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Sat, 11 Nov 2023 14:57:59 -0500 Subject: [PATCH] Enable model selection for first stage models (#808) Enables model selection for _OrthoLearner first stage models, and changes all concrete subclasses to allow selection between linear and random forest models for all first stage models by default. --------- Signed-off-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Signed-off-by: Keith Battocchi Co-authored-by: AnthonyCampbell208 <78286293+AnthonyCampbell208@users.noreply.github.com> Co-authored-by: ShrutiRM97 <98553136+ShrutiRM97@users.noreply.github.com> Co-authored-by: CooperGibbs --- econml/_ortho_learner.py | 40 ++- econml/dml/_rlearner.py | 50 +-- econml/dml/causal_forest.py | 24 +- econml/dml/dml.py | 186 +++++----- econml/dr/_drlearner.py | 118 ++++--- econml/iv/dml/_dml.py | 234 +++++-------- econml/iv/dr/_dr.py | 299 +++++++--------- econml/panel/dml/_dml.py | 92 +++-- econml/sklearn_extensions/linear_model.py | 109 +++--- econml/sklearn_extensions/model_selection.py | 340 ++++++++++++++++++- econml/tests/test_dml.py | 44 ++- econml/tests/test_dmliv.py | 26 +- econml/tests/test_driv.py | 58 +++- econml/tests/test_drlearner.py | 14 +- econml/tests/test_missing_values.py | 2 +- econml/tests/test_ortho_learner.py | 14 +- econml/tests/test_refit.py | 12 +- econml/tests/test_treatment_featurization.py | 71 ++-- econml/tests/utilities.py | 17 +- econml/utilities.py | 57 ---- notebooks/Scaling EconML using Ray.ipynb | 36 +- 21 files changed, 1028 insertions(+), 815 deletions(-) diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 15d7b7af3..d41ec66c9 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -45,6 +45,7 @@ class in this module implements the general logic in a very versatile way from .utilities import (_deprecate_positional, check_input_arrays, cross_product, filter_none_kwargs, inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose) +from .sklearn_extensions.model_selection import ModelSelector try: import ray @@ -100,7 +101,7 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()} kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()} - model.fit(*args_train, **kwargs_train) + model.train(False, *args_train, **kwargs_train) nuisance_temp = model.predict(*args_test, **kwargs_test) if not isinstance(nuisance_temp, tuple): @@ -115,17 +116,18 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): return nuisance_temp, model, test_idxs, (score_temp if calculate_scores else None) -def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): +def _crossfit(model: ModelSelector, folds, use_ray, ray_remote_fun_option, *args, **kwargs): """ General crossfit based calculation of nuisance parameters. Parameters ---------- - model : object - An object that supports fit and predict. Fit must accept all the args - and the keyword arguments kwargs. Similarly predict must all accept - all the args as arguments and kwards as keyword arguments. The fit - function estimates a model of the nuisance function, based on the input + model : ModelSelector + An object that has train and predict methods. 
+ The train method must take an 'is_selecting' argument first, and then + accept positional arguments `args` and keyword arguments `kwargs`; the predict method + just takes those `args` and `kwargs`. The train + method selects or estimates a model of the nuisance function, based on the input data to fit. Predict evaluates the fitted nuisance function on the input data to predict. folds : list of tuple or None @@ -177,7 +179,7 @@ def _crossfit(model, folds, use_ray, ray_remote_fun_option, *args, **kwargs): class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): @@ -202,13 +204,17 @@ def predict(self, X, y, W=None): """ model_list = [] + + kwargs = filter_none_kwargs(**kwargs) + model.train(True, *args, **kwargs) + calculate_scores = hasattr(model, 'score') # remove None arguments - kwargs = filter_none_kwargs(**kwargs) if folds is None: # skip crossfitting model_list.append(clone(model, safe=False)) - model_list[0].fit(*args, **kwargs) + model_list[0].train(True, *args, **kwargs) + model_list[0].train(False, *args, **kwargs) # fit the selected model nuisances = model_list[0].predict(*args, **kwargs) scores = model_list[0].score(*args, **kwargs) if calculate_scores else None @@ -394,7 +400,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -448,7 +454,7 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) self._model_y.fit(W, Y) return self @@ -532,15 +538,15 @@ def _gen_allowed_missing_vars(self): @abstractmethod def _gen_ortho_learner_model_nuisance(self): - """ Must return a fresh instance of a nuisance model + """Must return a fresh instance of a nuisance model selector Returns ------- - model_nuisance: estimator - The estimator for fitting the nuisance function. Must implement - `fit` and `predict` methods that both have signatures:: + model_nuisance: selector + The selector for fitting the nuisance function. The returned estimator must implement + `train` and `predict` methods that both have signatures:: - model_nuisance.fit(Y, T, X=X, W=W, Z=Z, + model_nuisance.train(is_selecting, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) model_nuisance.predict(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index bd645fda3..a020b432d 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -29,40 +29,35 @@ import numpy as np import copy from warnings import warn + +from ..sklearn_extensions.model_selection import ModelSelector from ..utilities import (shape, reshape, ndim, hstack, filter_none_kwargs, _deprecate_positional) from sklearn.linear_model import LinearRegression from sklearn.base import clone from .._ortho_learner import _OrthoLearner -class _ModelNuisance: +class _ModelNuisance(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns the residuals as two nuisance parameters. 
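(Editorial aside, not part of the patch.) The selection protocol that `_crossfit` and nuisance selectors like this one follow is: `train(True, ...)` is called once so each selector can pick its best underlying model, then `train(False, ...)` is called on each training fold to fit the chosen model before `predict` is evaluated on the held-out fold. A minimal sketch of an object conforming to that interface is shown below; the class name `TwoCandidateSelector` and its simplified `(X, y)` signature are hypothetical and only illustrate the two-phase `train` call, not the exact `SingleModelSelector` API.

    import numpy as np
    from sklearn.base import clone
    from sklearn.model_selection import cross_val_score

    class TwoCandidateSelector:
        """Illustrative only: choose between two sklearn estimators by CV score."""
        def __init__(self, model_a, model_b):
            self._candidates = [model_a, model_b]

        def train(self, is_selecting, X, y, **kwargs):  # extra kwargs ignored in this sketch
            if is_selecting:
                # selection pass (is_selecting=True): score each candidate and remember the winner
                scores = [cross_val_score(clone(m), X, y).mean() for m in self._candidates]
                self.best_model = clone(self._candidates[int(np.argmax(scores))])
            else:
                # fitting pass (is_selecting=False): fit only the previously selected model
                self.best_model.fit(X, y)
            return self

        def predict(self, X):
            return self.best_model.predict(X)
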
""" - def __init__(self, model_y, model_t): + def __init__(self, model_y: ModelSelector, model_t: ModelSelector): self._model_y = model_y self._model_t = model_t - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): assert Z is None, "Cannot accept instrument!" - self._model_t.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_y.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_y.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - if hasattr(self._model_y, 'score'): - # note that groups are not passed to score because they are only used for fitting - Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) - else: - Y_score = None - if hasattr(self._model_t, 'score'): - # note that groups are not passed to score because they are only used for fitting - T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) - else: - T_score = None + # note that groups are not passed to score because they are only used for fitting + T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) + Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) return Y_score, T_score def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -208,6 +203,7 @@ class _RLearner(_OrthoLearner): import numpy as np from sklearn.linear_model import LinearRegression from econml.dml._rlearner import _RLearner + from econml.sklearn_extensions.model_selection import SingleModelSelector from sklearn.base import clone class ModelFirst: def __init__(self, model): @@ -217,6 +213,18 @@ def fit(self, X, W, Y, sample_weight=None): return self def predict(self, X, W): return self._model.predict(np.hstack([X, W])) + class ModelSelector(SingleModelSelector): + def __init__(self, model): + self._model = ModelFirst(model) + def train(self, is_selecting, X, W, Y, sample_weight=None): + self._model.fit(X, W, Y, sample_weight=sample_weight) + return self + @property + def best_model(self): + return self._model + @property + def best_score(self): + return 0 class ModelFinal: def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_var=None): self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), @@ -226,9 +234,9 @@ def predict(self, X): return self.model.predict(X) class RLearner(_RLearner): def _gen_model_y(self): - return ModelFirst(LinearRegression()) + return ModelSelector(LinearRegression()) def _gen_model_t(self): - return ModelFirst(LinearRegression()) + return ModelSelector(LinearRegression()) def _gen_rlearner_model_final(self): return ModelFinal() np.random.seed(123) @@ -302,7 +310,7 @@ def _gen_model_y(self): """ Returns ------- - model_y: estimator of E[Y | X, W] + model_y: selector for the estimator of E[Y | X, W] The estimator for fitting the response to the features and controls. Must implement `fit` and `predict` methods. Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. 
:: @@ -317,7 +325,7 @@ def _gen_model_t(self): """ Returns ------- - model_t: estimator of E[T | X, W] + model_t: selector for the estimator of E[T | X, W] The estimator for fitting the treatment to the features and controls. Must implement `fit` and `predict` methods. Unlike sklearn estimators both methods must take an extra second argument (the controls), i.e. :: @@ -432,11 +440,11 @@ def rlearner_model_final_(self): @property def models_y(self): - return [[mdl._model_y for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t(self): - return [[mdl._model_t for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y(self): diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index 4f038eb3f..2672b3c8a 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split from itertools import product from .dml import _BaseDML -from .dml import _FirstStageWrapper +from .dml import _make_first_stage_selector from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..inference import NormalInferenceResults @@ -548,10 +548,10 @@ class CausalForestDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.76625..., 1.52176..., 0.73679...]) + array([0.88518..., 1.25061..., 0.81112...]) >>> est.effect_interval(X[:3]) - (array([0.39668..., 1.08245... , 0.16566...]), - array([1.13581..., 1.96107..., 1.30791...])) + (array([0.40163..., 0.75023..., 0.46629...]), + array([1.36873..., 1.75099..., 1.15596...])) Attributes ---------- @@ -668,22 +668,10 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, False, self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return MultiOutputGRF(CausalForest(n_estimators=self.n_estimators, diff --git a/econml/dml/dml.py b/econml/dml/dml.py index ce579d519..be06d9b9e 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -29,74 +29,85 @@ from ..sklearn_extensions.model_selection import WeightedStratifiedKFold from ..utilities import (_deprecate_positional, add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, filter_none_kwargs) from .._shap import _shap_explain_model_cate +from 
..sklearn_extensions.model_selection import get_selector, ModelSelector, SingleModelSelector -class _FirstStageWrapper: - def __init__(self, model, is_Y, featurizer, linear_first_stages, discrete_treatment): - self._model = clone(model, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._is_Y = is_Y - self._linear_first_stages = linear_first_stages - self._discrete_treatment = discrete_treatment - - def _combine(self, X, W, n_samples, fitting=True): - if X is None: - # if both X and W are None, just return a column of ones - return (W if W is not None else np.ones((n_samples, 1))) - XW = hstack([X, W]) if W is not None else X - if self._is_Y and self._linear_first_stages: - if self._featurizer is None: - F = X - else: - F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X) - return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F])) - else: - return XW +def _combine(X, W, n_samples): + if X is None: + # if both X and W are None, just return a column of ones + return (W if W is not None else np.ones((n_samples, 1))) + return hstack([X, W]) if W is not None else X - def fit(self, X, W, Target, sample_weight=None, groups=None): - if (not self._is_Y) and self._discrete_treatment: - # In this case, the Target is the one-hot-encoding of the treatment variable - # We need to go back to the label representation of the one-hot so as to call - # the classifier. - if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - Target = inverse_onehot(Target) - if sample_weight is not None: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups, - sample_weight=sample_weight) - else: - fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups) - return self +class _FirstStageWrapper: + def __init__(self, model, discrete_target): + self._model = model # plain sklearn-compatible model, not a ModelSelector + self._discrete_target = discrete_target def predict(self, X, W): n_samples = X.shape[0] if X is not None else (W.shape[0] if W is not None else 1) - if (not self._is_Y) and self._discrete_treatment: - return self._model.predict_proba(self._combine(X, W, n_samples, fitting=False))[:, 1:] + if self._discrete_target: + return self._model.predict_proba(_combine(X, W, n_samples))[:, 1:] else: - return self._model.predict(self._combine(X, W, n_samples, fitting=False)) + return self._model.predict(_combine(X, W, n_samples)) def score(self, X, W, Target, sample_weight=None): if hasattr(self._model, 'score'): - if (not self._is_Y) and self._discrete_treatment: + if self._discrete_target: # In this case, the Target is the one-hot-encoding of the treatment variable # We need to go back to the label representation of the one-hot so as to call # the classifier. 
Target = inverse_onehot(Target) if sample_weight is not None: - return self._model.score(self._combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) + return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight) else: - return self._model.score(self._combine(X, W, Target.shape[0]), Target) + return self._model.score(_combine(X, W, Target.shape[0]), Target) else: return None +class _FirstStageSelector(SingleModelSelector): + def __init__(self, model: SingleModelSelector, discrete_target): + self._model = clone(model, safe=False) + self._discrete_target = discrete_target + + def train(self, is_selecting, X, W, Target, sample_weight=None, groups=None): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + self._model.train(is_selecting, _combine(X, W, Target.shape[0]), Target, + **filter_none_kwargs(groups=groups, sample_weight=sample_weight)) + return self + + @property + def best_model(self): + return _FirstStageWrapper(self._model.best_model, self._discrete_target) + + @property + def best_score(self): + return self._model.best_score + + +def _make_first_stage_selector(model, is_discrete, random_state): + if model == 'auto': + model = ['forest', 'linear'] + return _FirstStageSelector(get_selector(model, + is_discrete=is_discrete, + random_state=random_state), + discrete_target=is_discrete) + + class _FinalWrapper: def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick): self._model = clone(model_final, safe=False) @@ -380,6 +391,9 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. + verbose: int, default 2 + The verbosity level of the output messages. Higher values indicate more verbosity. + cv: int, cross-validation generator or an iterable, default 2 Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -453,18 +467,19 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.63382..., 1.78225..., 0.71859...]) + array([0.65142..., 1.82917..., 0.79287...]) >>> est.effect_interval(X[:3]) - (array([0.27937..., 1.27619..., 0.42091...]),...([0.98827... , 2.28831..., 1.01628...])) + (array([0.28936..., 1.31239..., 0.47626...]), + array([1.01348..., 2.34594..., 1.10949...])) >>> est.coef_ - array([ 0.42857..., 0.04488..., -0.03317..., 0.02258..., -0.14875...]) + array([ 0.32570..., -0.05311..., -0.03973..., 0.01598..., -0.11045...]) >>> est.coef__interval() - (array([ 0.25179..., -0.10558..., -0.16723... , -0.11916..., -0.28759...]), - array([ 0.60535..., 0.19536..., 0.10088..., 0.16434..., -0.00990...])) + (array([ 0.13791..., -0.20081..., -0.17941..., -0.12073..., -0.25769...]), + array([0.51348..., 0.09458..., 0.09993..., 0.15269..., 0.03679...])) >>> est.intercept_ - 1.01166... + 1.02940... >>> est.intercept__interval() - (0.87125..., 1.15207...) + (0.88754..., 1.17125...) 
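(Editorial note, not part of the patch.) With the helper defined above, leaving `model_y` or `model_t` as `'auto'` expands to the list `['forest', 'linear']`, so the first stage chooses between a random forest and a linear model via the selector rather than always using a single fixed estimator. A hedged sketch of what that implies (`t_selector` is a hypothetical variable, and `_make_first_stage_selector` is a private helper shown only to clarify the default):

    from econml.dml.dml import _make_first_stage_selector

    # 'auto' is expanded to ['forest', 'linear'], so the treatment model is chosen
    # between a random forest and a linear model; because is_discrete=True,
    # classifiers are used and predictions come from predict_proba(...)[:, 1:]
    # (the control category's column is dropped).
    t_selector = _make_first_stage_selector('auto', is_discrete=True, random_state=123)
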
""" def __init__(self, *, @@ -472,7 +487,7 @@ def __init__(self, *, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, - linear_first_stages=False, + linear_first_stages="deprecated", discrete_treatment=False, categories='auto', cv=2, @@ -483,9 +498,10 @@ def __init__(self, *, use_ray=False, ray_remote_func_options=None ): - # TODO: consider whether we need more care around stateful featurizers, - # since we clone it and fit separate copies self.fit_cate_intercept = fit_cate_intercept + if linear_first_stages != "deprecated": + warn("The linear_first_stages parameter is deprecated and will be removed in a future version of EconML", + DeprecationWarning) self.linear_first_stages = linear_first_stages self.featurizer = clone(featurizer, safe=False) self.model_y = clone(model_y, safe=False) @@ -509,24 +525,10 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, False, self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) @@ -698,20 +700,19 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59252... , 1.74657..., 0.77384...]) + array([0.60257..., 1.74564..., 0.72062...]) >>> est.effect_interval(X[:3]) - (array([0.25503..., 1.24556..., 0.48440...]), - array([0.93002... , 2.24757..., 1.06328... ])) + (array([0.25760..., 1.24005..., 0.41770...]), + array([0.94754..., 2.25123..., 1.02354...])) >>> est.coef_ - array([ 0.39746..., -0.00313..., 0.01346..., 0.01402..., -0.09071...]) + array([ 0.41635..., 0.00287..., -0.01831..., -0.01197..., -0.11620...]) >>> est.coef__interval() - (array([ 0.23709..., -0.13618... , -0.11712..., -0.11954..., -0.22782...]), - array([0.55783..., 0.12991..., 0.14405..., 0.14758..., 0.04640...])) + (array([ 0.24496..., -0.13418..., -0.14852..., -0.13947..., -0.25089...]), + array([0.58775..., 0.13993..., 0.11189..., 0.11551..., 0.01848...])) >>> est.intercept_ - 0.99197... + 0.97162... >>> est.intercept__interval() - (0.85855..., 1.12539...) - + (0.83640..., 1.10684...) 
""" def __init__(self, *, @@ -719,7 +720,7 @@ def __init__(self, *, featurizer=None, treatment_featurizer=None, fit_cate_intercept=True, - linear_first_stages=True, + linear_first_stages="deprecated", discrete_treatment=False, categories='auto', cv=2, @@ -954,19 +955,19 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59401..., 1.74717..., 0.77105...]) + array([0.59812..., 1.75138..., 0.71770...]) >>> est.effect_interval(X[:3]) - (array([0.26608..., 1.26369..., 0.48690...]), - array([0.92195..., 2.23066..., 1.05520...])) + (array([0.25046..., 1.24249..., 0.42606...]), + array([0.94577..., 2.26028..., 1.00935... ])) >>> est.coef_ - array([ 0.39857..., -0.00101... , 0.01112..., 0.01457..., -0.09117...]) + array([ 0.41820..., 0.00506..., -0.01831..., -0.00778..., -0.11965...]) >>> est.coef__interval() - (array([ 0.24285..., -0.13728..., -0.12351..., -0.11585..., -0.22974...]), - array([0.55430..., 0.13526..., 0.14576..., 0.14501... , 0.04738...])) + (array([ 0.25058..., -0.13713..., -0.15469..., -0.13932..., -0.26252...]), + array([0.58583..., 0.14726..., 0.11806..., 0.12376..., 0.02320...])) >>> est.intercept_ - 0.99378... + 0.97131... >>> est.intercept__interval() - (0.86045..., 1.12711...) + (0.83363..., 1.10899...) """ def __init__(self, *, @@ -1203,7 +1204,7 @@ class KernelDML(DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59341..., 1.54740..., 0.69454... ]) + array([0.64124..., 1.46561..., 0.68568...]) """ def __init__(self, model_y='auto', model_t='auto', @@ -1411,7 +1412,7 @@ class NonParamDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.35318..., 1.28760..., 0.83506...]) + array([0.32389..., 0.85703..., 0.97468...]) """ def __init__(self, *, @@ -1458,12 +1459,11 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - return _FirstStageWrapper(clone(self.model_y, safe=False), True, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - return _FirstStageWrapper(clone(self.model_t, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 9b75ca75d..749eec5d9 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -43,6 +43,7 @@ LogisticRegressionCV) from sklearn.ensemble import RandomForestRegressor + from .._ortho_learner import _OrthoLearner from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, BaseCateEstimator, ForestModelFinalCateEstimatorDiscreteMixin, @@ -51,13 +52,17 @@ from ..grf import RegressionForest from ..sklearn_extensions.linear_model import ( DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) +from ..sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, get_selector from ..utilities import (_deprecate_positional, check_high_dimensional, - filter_none_kwargs, fit_with_groups, inverse_onehot, get_feature_names_or_default) + filter_none_kwargs, inverse_onehot, get_feature_names_or_default) from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate -class _ModelNuisance: - def __init__(self, model_propensity, 
model_regression, min_propensity): +class _ModelNuisance(ModelSelector): + def __init__(self, + model_propensity: SingleModelSelector, + model_regression: SingleModelSelector, + min_propensity): self._model_propensity = model_propensity self._model_regression = model_regression self._min_propensity = min_propensity @@ -65,7 +70,7 @@ def __init__(self, model_propensity, model_regression, min_propensity): def _combine(self, X, W): return np.hstack([arr for arr in [X, W] if arr is not None]) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, *, sample_weight=None, groups=None): if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " "instead got {1}.".format(len(X), Y.shape)) @@ -77,22 +82,16 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - fit_with_groups(self._model_propensity, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) - fit_with_groups(self._model_regression, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) + self._model_propensity.train(is_selecting, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) + self._model_regression.train(is_selecting, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) return self def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - if hasattr(self._model_propensity, 'score'): - propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) - else: - propensity_score = None - if hasattr(self._model_regression, 'score'): - regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) - else: - regression_score = None + propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) + regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) return propensity_score, regression_score @@ -114,6 +113,12 @@ def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,)) +def _make_first_stage_selector(model, is_discrete, random_state): + if model == "auto": + model = ['linear', 'forest'] + return get_selector(model, is_discrete=is_discrete, random_state=random_state) + + class _ModelFinal: # Coding Remark: The reasoning around the multitask_model_final could have been simplified if # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want @@ -335,30 +340,29 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.const_marginal_effect(X[:2]) - array([[0.511640..., 1.144004...], - [0.378140..., 0.613143...]]) + array([[0.520977..., 1.244073...], + [0.365645..., 0.749762...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([0.511640..., 0.378140...]) + array([0.520977..., 0.365645...]) >>> est.score_ - 5.11238581... + 3.15958089... >>> est.score(y, T, X=X) - 5.78673506... + 2.60965712... 
>>> est.model_cate(T=1).coef_ - array([0.434910..., 0.010226..., 0.047913...]) + array([0.369069..., 0.016610..., 0.019072...]) >>> est.model_cate(T=2).coef_ - array([ 0.863723..., 0.086946..., -0.022288...]) + array([ 0.768336..., 0.082106..., -0.030475...]) >>> est.cate_feature_names() ['X0', 'X1', 'X2'] >>> [mdl.coef_ for mdls in est.models_regression for mdl in mdls] - [array([ 1.472..., 0.001..., -0.011..., 0.698..., 2.049...]), - array([ 1.455..., -0.002..., 0.005..., 0.677..., 1.998...])] + [array([ 1.463..., 0.006..., -0.006..., 0.726..., 2.029...]), + array([ 1.466..., -0.002..., 0..., 0.646..., 2.014...])] >>> [mdl.coef_ for mdls in est.models_propensity for mdl in mdls] - [array([[-0.747..., 0.153..., -0.018...], - [ 0.083..., -0.110..., -0.076...], - [ 0.663..., -0.043... , 0.094...]]), - array([[-1.048..., 0.000..., 0.032...], - [ 0.019..., 0.124..., -0.081...], - [ 1.029..., -0.124..., 0.049...]])] + [array([[-0.67903093, 0.04261741, -0.05969718], + [ 0.034..., -0.013..., -0.013...], + [ 0.644..., -0.028..., 0.073...]]), array([[-0.831..., 0.100..., 0.090...], + [ 0.084..., 0.013..., -0.154...], + [ 0.747..., -0.113..., 0.063...]])] Beyond default models: @@ -380,19 +384,19 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.score_ - 1.7... + 3.7... >>> est.const_marginal_effect(X[:3]) - array([[0.68..., 1.10...], - [0.56..., 0.79...], - [0.34..., 0.10...]]) + array([[0.64..., 1.23...], + [0.49..., 0.92...], + [0.20..., 0.26...]]) >>> est.model_cate(T=2).coef_ - array([0.74..., 0. , 0. ]) + array([0.72..., 0. , 0. ]) >>> est.model_cate(T=2).intercept_ - 1.9... + 2.0... >>> est.model_cate(T=1).coef_ - array([0.24..., 0.00..., 0. ]) + array([0.31..., 0.01..., 0.00...]) >>> est.model_cate(T=1).intercept_ - 0.94... + 0.97... Attributes ---------- @@ -499,16 +503,8 @@ def _get_inference_options(self): return options def _gen_ortho_learner_model_nuisance(self): - if self.model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=self.random_state) - else: - model_propensity = clone(self.model_propensity, safe=False) - - if self.model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) - else: - model_regression = clone(self.model_regression, safe=False) + model_propensity = _make_first_stage_selector(self.model_propensity, True, self.random_state) + model_regression = _make_first_stage_selector(self.model_regression, False, self.random_state) return _ModelNuisance(model_propensity, model_regression, self.min_propensity) @@ -648,7 +644,7 @@ def models_propensity(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_propensity for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_propensity.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_regression(self): @@ -662,7 +658,7 @@ def models_regression(self): monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
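(Editorial aside, not part of the patch.) Because the nuisance-model properties now return each fold's `best_model`, inspecting which estimator won the selection in each fold is straightforward. A hypothetical snippet, assuming `y`, `T`, `X` from the example above; `chosen_regressors` is an illustrative variable name:

    est = DRLearner(random_state=123)
    est.fit(y, T, X=X, W=None)
    # each entry is the estimator *selected* for one crossfitting fold, e.g. a linear
    # model or a random forest depending on which scored better during selection
    chosen_regressors = [type(m).__name__
                         for fold_models in est.models_regression
                         for m in fold_models]
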
""" - return [[mdl._model_regression for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_regression.best_model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_propensity(self): @@ -868,17 +864,17 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([ 0.409743..., 0.312604..., -0.127394...]) + array([0.457602..., 0.335707..., 0.011288...]) >>> est.effect_interval(X[:3]) - (array([ 0.065306..., -0.182074..., -0.765901...]), array([0.754180..., 0.807284..., 0.511113...])) + (array([ 0.164623..., -0.098980..., -0.493464...]), array([0.750582..., 0.77039... , 0.516041...])) >>> est.coef_(T=1) - array([ 0.450779..., -0.003214... , 0.063884... ]) + array([0.338061..., 0.025654..., 0.044389...]) >>> est.coef__interval(T=1) - (array([ 0.155111..., -0.246272..., -0.136827...]), array([0.746447..., 0.239844..., 0.264595...])) + (array([ 0.135677..., -0.155845..., -0.143376...]), array([0.540446..., 0.207155..., 0.232155...])) >>> est.intercept_(T=1) - 0.88425066... + 0.78646497... >>> est.intercept__interval(T=1) - (0.64868548..., 1.11981585...) + (0.60344468..., 0.96948526...) Attributes ---------- @@ -1161,17 +1157,17 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([ 0.41..., 0.31..., -0.12...]) + array([0.45..., 0.33..., 0.01...]) >>> est.effect_interval(X[:3]) - (array([-0.02..., -0.29... , -0.84...]), array([0.84..., 0.92..., 0.59...])) + (array([ 0.11..., -0.13..., -0.54...]), array([0.79..., 0.80..., 0.57...])) >>> est.coef_(T=1) - array([ 0.45..., -0.00..., 0.06...]) + array([0.33..., 0.02..., 0.04...]) >>> est.coef__interval(T=1) - (array([ 0.20..., -0.23..., -0.17...]), array([0.69..., 0.23..., 0.30...])) + (array([ 0.14..., -0.15..., -0.14...]), array([0.53..., 0.20..., 0.23...])) >>> est.intercept_(T=1) - 0.88... + 0.78... >>> est.intercept__interval(T=1) - (0.64..., 1.11...) + (0.60..., 0.96...) 
Attributes ---------- diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index c8889599f..b42925ff9 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -24,17 +24,30 @@ from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin, LinearCateEstimator from ...inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference from ...sklearn_extensions.linear_model import StatsModels2SLS, StatsModelsLinearRegression, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import (ModelSelector, SingleModelSelector, + WeightedStratifiedKFold, get_selector) from ...utilities import (_deprecate_positional, get_feature_names_or_default, filter_none_kwargs, add_intercept, cross_product, broadcast_unit_treatments, reshape_treatmentwise_effects, shape, parse_final_model_params, deprecated, Summary) -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...dml._rlearner import _ModelFinal from ..._shap import _shap_explain_joint_linear_model_cate, _shap_explain_model_cate -class _OrthoIVModelNuisance: - def __init__(self, model_y_xw, model_t_xw, model_z, projection): +def _combine(W, Z, n_samples): + if Z is not None: + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _OrthoIVNuisanceSelector(ModelSelector): + + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xw: SingleModelSelector, + model_z: SingleModelSelector, + projection): self._model_y_xw = model_y_xw self._model_t_xw = model_t_xw self._projection = projection @@ -43,21 +56,15 @@ def __init__(self, model_y_xw, model_t_xw, model_z, projection): else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -71,7 +78,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) if hasattr(self._model_t_xwz, 'score'): T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: @@ 
-91,7 +98,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = self._model_t_xwz.predict(X, WZ) else: Z_pred = self._model_z_xw.predict(X=X, W=W) @@ -324,19 +331,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.57086..., 6.06523..., -3.02513...]) + array([-4.49594..., 5.79852..., -2.88049...]) >>> est.effect_interval(X[:3]) - (array([-7.45472..., 1.85334..., -5.47322...]), - array([-1.68700... , 10.27712..., -0.57704...])) + (array([-7.40954..., 1.47475..., -5.32889...]), + array([-1.58235..., 10.12229..., -0.43209...])) >>> est.coef_ - array([ 5.11260... , 0.71353..., 0.38242..., -0.23891..., -0.07036...]) + array([ 5.27614..., 0.92092..., 0.57579..., -0.22810..., -0.16952...]) >>> est.coef__interval() - (array([ 3.76773..., -0.42532..., -0.78145..., -1.36996..., -1.22505...]), - array([6.45747..., 1.85239..., 1.54631..., 0.89213..., 1.08432...])) + (array([ 3.93362..., -0.22159..., -0.59863..., -1.39139..., -1.34549...]), + array([6.61866..., 2.06345..., 1.75022..., 0.93518..., 1.00644...])) >>> est.intercept_ - -0.24090... + -0.29110... >>> est.intercept__interval() - (-1.39053..., 0.90872...) + (-1.45607..., 0.87386...) """ def __init__(self, *, @@ -387,57 +394,29 @@ def _gen_ortho_learner_model_final(self): return _OrthoIVModelFinal(self._gen_model_final(), self._gen_featurizer(), self.fit_cate_intercept) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) + model_y = _make_first_stage_selector(self.model_y_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_t = _make_first_stage_selector(self.model_t_xw, + is_discrete=self.discrete_treatment, + random_state=self.random_state) if self.projection: # train E[T|X,W,Z] - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_t_xwz, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - self.projection) + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - # train [Z|X,W] - if self.model_z_xw == "auto": - if self.discrete_instrument: - model_z_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) 
+ # train E[Z|X,W] + # note: discrete_instrument rather than discrete_treatment in call to _make_first_stage_selector + model_z = _make_first_stage_selector(self.model_z_xw, + is_discrete=self.discrete_instrument, + random_state=self.random_state) - return _OrthoIVModelNuisance(_FirstStageWrapper(clone(model_y_xw, safe=False), True, - self._gen_featurizer(), False, False), - _FirstStageWrapper(clone(model_t_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_treatment), - _FirstStageWrapper(clone(model_z_xw, safe=False), False, - self._gen_featurizer(), False, self.discrete_instrument), - self.projection) + return _OrthoIVNuisanceSelector(model_y, model_t, model_z, + self.projection) def fit(self, Y, T, *, Z, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, cache_values=False, inference="auto"): @@ -604,7 +583,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -618,7 +597,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_z_xw(self): @@ -634,7 +613,7 @@ def models_z_xw(self): """ if self.projection: raise AttributeError("Projection model is fitted for instrument! Use models_t_xwz.") - return [[mdl._model_z_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_z_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -650,7 +629,7 @@ def models_t_xwz(self): """ if not self.projection: raise AttributeError("Direct model is fitted for instrument! Use models_z_xw.") - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y_xw(self): @@ -717,29 +696,24 @@ def residuals_(self): return Y_res, T_res, Z_res, self._cached_values.X, self._cached_values.W, self._cached_values.Z -class _BaseDMLIVModelNuisance: +class _BaseDMLIVNuisanceSelector(ModelSelector): """ Nuisance model fits the three models at fit time and at predict time returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. 
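(Editorial restatement of the description above, conditioning on the controls W as well; this describes existing behavior, not new code.) The two nuisances returned by `predict` are

    Y_res = Y - E[Y | X, W]
    T_res = E[T | X, W, Z] - E[T | X, W]

and the final stage then fits the CATE by regressing the outcome residual on the instrument-projected treatment residual.
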
""" - def __init__(self, model_y_xw, model_t_xw, model_t_xwz): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xw = clone(model_t_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - - def _combine(self, W, Z, n_samples): - if Z is not None: - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W + def __init__(self, model_y_xw: ModelSelector, model_t_xw: ModelSelector, model_t_xwz: ModelSelector): + self._model_y_xw = model_y_xw + self._model_t_xw = model_t_xw + self._model_t_xwz = model_t_xwz - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_t_xw.fit(X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t_xw.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X, WZ, T, + **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -754,7 +728,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): T_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight)) else: T_XZ_score = None @@ -764,7 +738,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) # note that sample_weight and groups are not passed to predict because they are only used for fitting Y_pred = self._model_y_xw.predict(X, W) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) TXZ_pred = self._model_t_xwz.predict(X, WZ) TX_pred = self._model_t_xw.predict(X, W) if (X is None) and (W is None): # In this case predict above returns a single row @@ -909,7 +883,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -923,7 +897,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -937,7 +911,7 @@ def models_t_xwz(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def nuisance_scores_y_xw(self): @@ -1139,11 +1113,11 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.47392..., 5.74626..., -3.08471...]) + array([-6.83575..., 9.40666..., -4.27123...]) >>> est.coef_ - array([ 5.00993..., 0.86981..., 0.35110..., -0.11390... , -0.17933...]) + array([ 8.07179..., 1.51080..., 0.87328..., -0.06944..., -0.47404...]) >>> est.intercept_ - -0.27719... + -0.20555... """ @@ -1183,42 +1157,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xwz, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), @@ -1541,7 +1492,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-5.52240..., 7.86930..., -3.57966...]) + array([-6.18157..., 8.70189..., -4.06004...]) """ @@ -1579,42 +1530,19 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y_xw(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - return _FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False) + return _make_first_stage_selector(self.model_y_xw, False, self.random_state) def _gen_model_t_xw(self): - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw 
= WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) - return _FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) def _gen_model_t_xwz(self): - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - return _FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment) + return _make_first_stage_selector(self.model_t_xwz, self.discrete_treatment, self.random_state) def _gen_model_final(self): return clone(self.model_final, safe=False) def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) + return _BaseDMLIVNuisanceSelector(self._gen_model_y_xw(), self._gen_model_t_xw(), self._gen_model_t_xwz()) def _gen_ortho_learner_model_final(self): return _BaseDMLIVModelFinal(_FinalWrapper(self._gen_model_final(), diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index 5c1df1c94..a72272774 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -27,16 +27,23 @@ LinearCateEstimator) from ...inference import StatsModelsInference from ...sklearn_extensions.linear_model import StatsModelsLinearRegression, DebiasedLasso, WeightedLassoCVWrapper -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold +from ...sklearn_extensions.model_selection import ModelSelector, SingleModelSelector, WeightedStratifiedKFold from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs, inverse_onehot, get_feature_names_or_default, check_high_dimensional, check_input_arrays) from ...grf import RegressionForest -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ...iv.dml import NonParamDMLIV from ..._shap import _shap_explain_model_cate -class _BaseDRIVModelNuisance: +def _combine(W, Z, n_samples): + if Z is not None: # Z will not be None + Z = Z.reshape(n_samples, -1) + return Z if W is None else np.hstack([W, Z]) + return None if W is None else W + + +class _BaseDRIVNuisanceSelector(ModelSelector): def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, model_z, projection, fit_cov_directly, discrete_treatment, discrete_instrument): @@ -53,22 +60,30 @@ def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, mo else: self._model_z_xw = model_z - def _combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary T = T.ravel() if not self._discrete_treatment else T Z = Z.ravel() if not self._discrete_instrument else Z - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.fit(X=X, W=W, Target=T, 
sample_weight=sample_weight, groups=groups) + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting and self._fit_cov_directly: + # need to fit, too, since we call predict later inside this train method + self._model_t_xw.train(False, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + + if self._projection: + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_t_xwz.train(False, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + else: + self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if is_selecting: + # need to fit, too, since we call predict later inside this train method + self._model_z_xw.train(False, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + if self._projection: - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) if self._fit_cov_directly: # We're projecting, so we're treating E[T|X,Z] as the instrument (ignoring W for simplicity) @@ -82,15 +97,14 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: T_pred = T_pred.reshape(T.shape) target = (T_proj - T_pred)**2 - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: # return shape (n,) target = (T * T_proj).reshape(T.shape[0],) - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) if self._fit_cov_directly: Z_pred = self._model_z_xw.predict(X, W) T_pred = self._model_t_xw.predict(X, W) @@ -111,10 +125,10 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): target_shape = Z_res.shape if Z_res.ndim > 1 else T_res.shape target = T_res.reshape(target_shape) * Z_res.reshape(target_shape) # TODO: if the T and Z models overfit, then this will be biased towards 0; - # consider using nested cross-fitting here + # consider using nested cross-fitting # a similar comment applies to the projection case - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) else: if self._discrete_treatment: if self._discrete_instrument: @@ -130,8 +144,8 @@ def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: # shape(n,) target = T * Z - self._model_tz_xw.fit(X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) # TODO: prel_model_effect could allow sample_var and freq_weight? 
if self._discrete_instrument: @@ -168,7 +182,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): if self._projection: if hasattr(self._model_t_xwz, 'score'): - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) t_xwz_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: t_xwz_score = None @@ -232,7 +246,7 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if self._projection: # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) Z_res = T_proj - T_pred if self._fit_cov_directly: @@ -650,86 +664,40 @@ def _gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xw == 'auto': - if self.discrete_treatment: - model_t_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xw = clone(self.model_t_xw, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, False, self.random_state) + model_t_xw = _make_first_stage_selector(self.model_t_xw, self.discrete_treatment, self.random_state) if self.projection: - # this is a regression model since proj_t is probability - if self.model_tz_xw == "auto": - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = clone(self.model_tz_xw, safe=False) + # this is a regression model since the instrument E[T|X,W,Z] is always continuous + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, + is_discrete=False, + random_state=self.random_state) - if self.model_t_xwz == 'auto': - if self.discrete_treatment: - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - # outcome is continuous since proj_t is probability - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, False), - model_z=_FirstStageWrapper(model_t_xwz, False, self._gen_featurizer(), - False, self.discrete_treatment), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + # we're using E[T|X,W,Z] as the instrument + model_z = _make_first_stage_selector(self.model_t_xwz, + is_discrete=self.discrete_treatment, + random_state=self.random_state) else: - if self.model_tz_xw == "auto": - if self.discrete_treatment and self.discrete_instrument and not self.fit_cov_directly: - model_tz_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_tz_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_tz_xw = 
clone(self.model_tz_xw, safe=False) - - if self.model_z_xw == 'auto': - if self.discrete_instrument: - model_z_xw = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_z_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_z_xw = clone(self.model_z_xw, safe=False) - - return _BaseDRIVModelNuisance(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=_FirstStageWrapper( - model_y_xw, True, self._gen_featurizer(), False, False), - model_t_xw=_FirstStageWrapper(model_t_xw, False, self._gen_featurizer(), - False, self.discrete_treatment), - model_tz_xw=_FirstStageWrapper(model_tz_xw, False, self._gen_featurizer(), - False, (self.discrete_treatment and - self.discrete_instrument and - not self.fit_cov_directly)), - model_z=_FirstStageWrapper(model_z_xw, False, self._gen_featurizer(), - False, (self.discrete_instrument and - not self.fit_cov_directly)), - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + model_tz_xw = _make_first_stage_selector(self.model_tz_xw, + is_discrete=(self.discrete_treatment and + self.discrete_instrument and + not self.fit_cov_directly), + random_state=self.random_state) + + model_z = _make_first_stage_selector(self.model_z_xw, + is_discrete=self.discrete_instrument, + random_state=self.random_state) + + return _BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), + model_y_xw=model_y_xw, + model_t_xw=model_t_xw, + model_tz_xw=model_tz_xw, + model_z=model_z, + projection=self.projection, + fit_cov_directly=self.fit_cov_directly, + discrete_treatment=self.discrete_treatment, + discrete_instrument=self.discrete_instrument) class DRIV(_DRIV): @@ -915,7 +883,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-1.71678..., -0.27824..., -3.18333...]) + array([-4.07330..., 6.01693..., -2.71813...]) """ def __init__(self, *, @@ -1090,7 +1058,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -1104,7 +1072,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_t_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_z_xw(self): @@ -1120,7 +1088,7 @@ def models_z_xw(self): """ if self.projection: raise AttributeError("Projection model is fitted for instrument! Use models_t_xwz.") - return [[mdl._model_z_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_z_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -1136,7 +1104,7 @@ def models_t_xwz(self): """ if not self.projection: raise AttributeError("Direct model is fitted for instrument! 
Use models_z_xw.") - return [[mdl._model_t_xwz._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_tz_xw(self): @@ -1150,7 +1118,7 @@ def models_tz_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_tz_xw._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_tz_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] @property def models_prel_model_effect(self): @@ -1396,19 +1364,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-0.54223..., 0.77763..., -2.01011...]) + array([-4.29809..., 5.94280..., -3.00977...]) >>> est.effect_interval(X[:3]) - (array([-4.73213..., -5.57270..., -5.84891...]), - array([3.64765..., 7.12797..., 1.82868...])) + (array([-7.09165..., 1.79692..., -5.46033...]), + array([-1.50452..., 10.08868..., -0.55922...])) >>> est.coef_ - array([ 3.12341..., 1.78962..., -0.45351..., -0.41677..., 0.93306...]) + array([ 4.84900..., 0.82084..., 0.24269..., -0.04771..., -0.29325...]) >>> est.coef__interval() - (array([ 1.36498..., 0.00496..., -2.28573..., -2.02274..., -0.94000...]), - array([4.88184..., 3.57428..., 1.37869..., 1.18919..., 2.80614...])) + (array([ 3.67882..., -0.35547..., -0.97063..., -1.15410..., -1.50482...]), + array([6.01917..., 1.99716..., 1.45603..., 1.05867..., 0.91831...])) >>> est.intercept_ - 1.10417... + -0.16276... >>> est.intercept__interval() - (-0.65690..., 2.86525...) + (-1.32713..., 1.00160...) """ def __init__(self, *, @@ -1747,19 +1715,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-0.68659..., 1.03696..., -2.10343...]) + array([-4.26791..., 5.98882..., -3.02154...]) >>> est.effect_interval(X[:3]) - (array([-4.92102..., -4.99359..., -5.79899...]), - array([3.54783..., 7.06753..., 1.59212...])) + (array([-7.06828..., 2.00060..., -5.46554...]), + array([-1.46754..., 9.97704..., -0.57754...])) >>> est.coef_ - array([ 3.18552..., 1.83651..., -0.47721..., -0.28640... , 0.87765...]) + array([ 4.84189..., 0.81844... , 0.20681..., -0.04660..., -0.28790...]) >>> est.coef__interval() - (array([ 1.43299..., 0.06316..., -2.28671..., -2.01185..., -0.93582...]), - array([4.93805..., 3.60987..., 1.33227... , 1.43904..., 2.69114...])) + (array([ 3.68288..., -0.35434..., -0.98986..., -1.18770..., -1.48722...]), + array([6.00090..., 1.99122..., 1.40349..., 1.09449..., 0.91141...])) >>> est.intercept_ - 1.15151... + -0.12298... >>> est.intercept__interval() - (-0.60109..., 2.90411...) + (-1.28204..., 1.03607...) 
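Because the first stages are now model selectors, the `models_y_xw`, `models_t_xw`, `models_z_xw`, `models_t_xwz` and `models_tz_xw` properties drill through `best_model` and return the estimator that won the selection in each crossfitting fold. A hypothetical inspection sketch (the data-generating code is made up):

    import numpy as np
    from econml.iv.dr import LinearDRIV

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 3))
    Z = rng.binomial(1, 0.5, size=(500,))
    T = rng.binomial(1, 0.4 + 0.2 * Z)
    y = T * X[:, 0] + rng.normal(size=(500,))

    est = LinearDRIV(discrete_treatment=True, discrete_instrument=True)
    est.fit(Y=y, T=T, Z=Z, X=X)
    for m in est.models_y_xw[0]:       # first Monte Carlo iteration, one model per fold
        print(type(m).__name__)        # whichever candidate scored best for that fold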
""" def __init__(self, *, @@ -2186,10 +2154,10 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-1.74672..., 1.57..., -1.58916...]) + array([-1.60489..., 5.40611..., -3.46904...]) >>> est.effect_interval(X[:3]) - (array([-7.05230..., -6..., -5.11344...]), - array([3.55885..., 9.9..., 1.93512...])) + (array([-5.37171..., 0.73055..., -7.15266...]), + array([ 2.16192..., 10.08168..., 0.21457...])) """ def __init__(self, *, @@ -2342,25 +2310,23 @@ def model_final(self, model): raise ValueError("Parameter `model_final` cannot be altered for this estimator!") -class _IntentToTreatDRIVModelNuisance: - def __init__(self, model_y_xw, model_t_xwz, dummy_z, prel_model_effect): - self._model_y_xw = clone(model_y_xw, safe=False) - self._model_t_xwz = clone(model_t_xwz, safe=False) - self._dummy_z = clone(dummy_z, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def _combine(self, W, Z, n_samples): - if Z is not None: # Z will not be None - Z = Z.reshape(n_samples, -1) - return Z if W is None else np.hstack([W, Z]) - return None if W is None else W +class _IntentToTreatDRIVNuisanceSelector(ModelSelector): + def __init__(self, + model_y_xw: SingleModelSelector, + model_t_xwz: SingleModelSelector, + dummy_z: SingleModelSelector, + prel_model_effect): + self._model_y_xw = model_y_xw + self._model_t_xwz = model_t_xwz + self._dummy_z = dummy_z + self._prel_model_effect = prel_model_effect - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) - self._model_t_xwz.fit(X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - self._dummy_z.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + WZ = _combine(W, Z, Y.shape[0]) + self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + self._dummy_z.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) # we need to undo the one-hot encoding for calling effect, # since it expects raw values self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, @@ -2374,7 +2340,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_X_score = None if hasattr(self._model_t_xwz, 'score'): # concat W and Z - WZ = self._combine(W, Z, Y.shape[0]) + WZ = _combine(W, Z, Y.shape[0]) T_XZ_score = self._model_t_xwz.score(X=X, W=WZ, Target=T, sample_weight=sample_weight) else: T_XZ_score = None @@ -2390,8 +2356,8 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_pred = self._model_y_xw.predict(X, W) - T_pred_zero = self._model_t_xwz.predict(X, self._combine(W, np.zeros(Z.shape), Y.shape[0])) - T_pred_one = self._model_t_xwz.predict(X, self._combine(W, np.ones(Z.shape), Y.shape[0])) + T_pred_zero = self._model_t_xwz.predict(X, _combine(W, np.zeros(Z.shape), Y.shape[0])) + T_pred_one = self._model_t_xwz.predict(X, _combine(W, np.ones(Z.shape), Y.shape[0])) Z_pred = self._dummy_z.predict(X, W) prel_theta = self._prel_model_effect.effect(X) @@ -2486,16 +2452,8 @@ def 
_gen_prel_model_effect(self): return clone(self.prel_model_effect, safe=False) def _gen_ortho_learner_model_nuisance(self): - if self.model_y_xw == 'auto': - model_y_xw = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y_xw = clone(self.model_y_xw, safe=False) - - if self.model_t_xwz == 'auto': - model_t_xwz = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t_xwz = clone(self.model_t_xwz, safe=False) + model_y_xw = _make_first_stage_selector(self.model_y_xw, is_discrete=False, random_state=self.random_state) + model_t_xwz = _make_first_stage_selector(self.model_t_xwz, is_discrete=True, random_state=self.random_state) if self.z_propensity == "auto": dummy_z = DummyClassifier(strategy="prior") @@ -2504,14 +2462,9 @@ def _gen_ortho_learner_model_nuisance(self): else: raise ValueError("Only 'auto' or float is allowed!") - return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(model_y_xw, True, self._gen_featurizer(), - False, False), - _FirstStageWrapper(model_t_xwz, False, - self._gen_featurizer(), False, True), - _FirstStageWrapper(dummy_z, False, - self._gen_featurizer(), False, True), - self._gen_prel_model_effect() - ) + dummy_z = _make_first_stage_selector(dummy_z, is_discrete=True, random_state=self.random_state) + + return _IntentToTreatDRIVNuisanceSelector(model_y_xw, model_t_xwz, dummy_z, self._gen_prel_model_effect()) class _DummyCATE: @@ -2674,7 +2627,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.29282..., 6.08590..., -2.11608...]) + array([-3.71724..., 6.39915..., -2.14545...]) """ def __init__(self, *, @@ -2968,19 +2921,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.81123..., 5.65430..., -2.63204...]) + array([-4.05294..., 6.44603..., -2.49535...]) >>> est.effect_interval(X[:3]) - (array([-8.42669..., 0.36538... , -5.82840...]), - array([-1.19578... , 10.94323..., 0.56430...])) + (array([-8.42902..., 0.05595..., -6.34202...]), + array([ 0.32313..., 12.83612..., 1.35131...])) >>> est.coef_ - array([ 5.01936..., 0.71988..., 0.82603..., -0.08192... , -0.02520...]) + array([ 4.99132..., 0.35043..., 0.41963..., -0.63553..., -0.33972...]) >>> est.coef__interval() - (array([ 3.52057... , -0.72550..., -0.72653..., -1.50040... , -1.52896...]), - array([6.51816..., 2.16527..., 2.37861..., 1.33656..., 1.47854...])) + (array([ 3.11828..., -1.44768..., -1.46377..., -2.36080..., -2.18746...]), + array([6.86435..., 2.14856..., 2.30303..., 1.08973..., 1.50802...])) >>> est.intercept_ - -0.45176... + -0.25633... >>> est.intercept__interval() - (-1.93313..., 1.02959...) + (-2.07961..., 1.56695...) 
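As in the updated tests, the intent-to-treat estimators accept explicit first-stage models whenever the default selection between linear and forest candidates is not wanted; a hypothetical construction sketch (no data is fit here):

    from sklearn.linear_model import LinearRegression, LogisticRegression
    from econml.iv.dr import LinearIntentToTreatDRIV
    from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression

    est = LinearIntentToTreatDRIV(model_y_xw=LinearRegression(),
                                  model_t_xwz=LogisticRegression(),
                                  flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False))
    # then fit as usual with a binary treatment and a binary instrument:
    # est.fit(Y=y, T=T, Z=Z, X=X)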
""" def __init__(self, *, diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index c3dc96a4e..8fe01b6d8 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -9,13 +9,13 @@ from scipy.stats import norm from sklearn.linear_model import (ElasticNetCV, LassoCV, LogisticRegressionCV) from ...sklearn_extensions.linear_model import (StatsModelsLinearRegression, WeightedLassoCVWrapper) -from ...sklearn_extensions.model_selection import WeightedStratifiedKFold -from ...dml.dml import _FirstStageWrapper, _FinalWrapper +from ...sklearn_extensions.model_selection import ModelSelector, WeightedStratifiedKFold +from ...dml.dml import _make_first_stage_selector, _FinalWrapper from ..._cate_estimator import TreatmentExpansionMixin, LinearModelFinalCateEstimatorMixin from ..._ortho_learner import _OrthoLearner from ...utilities import (_deprecate_positional, add_intercept, broadcast_unit_treatments, check_high_dimensional, - cross_product, deprecated, fit_with_groups, + cross_product, deprecated, hstack, inverse_onehot, ndim, reshape, reshape_treatmentwise_effects, shape, transpose, get_feature_names_or_default, check_input_arrays, @@ -33,7 +33,7 @@ def _get_groups_period_filter(groups, n_periods): return group_period_filter -class _DynamicModelNuisance: +class _DynamicModelNuisanceSelector(ModelSelector): """ Nuisance model fits the model_y and model_t at fit time and at predict time calculates the residual Y and residual T based on the fitted models and returns @@ -45,21 +45,27 @@ def __init__(self, model_y, model_t, n_periods): self._model_t = model_t self.n_periods = n_periods - def fit(self, Y, T, X=None, W=None, sample_weight=None, groups=None): + def train(self, is_selecting, Y, T, X=None, W=None, sample_weight=None, groups=None): """Fit a series of nuisance models for each period or period pairs.""" assert Y.shape[0] % self.n_periods == 0, \ "Length of training data should be an integer multiple of time periods." 
period_filters = _get_groups_period_filter(groups, self.n_periods) - self._model_y_trained = {} - self._model_t_trained = {j: {} for j in np.arange(self.n_periods)} + if is_selecting: # create the per-period y and t models + self._model_y_trained = {t: clone(self._model_y, safe=False) + for t in np.arange(self.n_periods)} + self._model_t_trained = {j: {t: clone(self._model_t, safe=False) + for t in np.arange(j + 1)} + for j in np.arange(self.n_periods)} for t in np.arange(self.n_periods): - self._model_y_trained[t] = clone(self._model_y, safe=False).fit( + self._model_y_trained[t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None( W, period_filters[t]), Y[period_filters[self.n_periods - 1]]) for j in np.arange(t, self.n_periods): - self._model_t_trained[j][t] = clone(self._model_t, safe=False).fit( + self._model_t_trained[j][t].train( + is_selecting, self._index_or_None(X, period_filters[t]), self._index_or_None(W, period_filters[t]), T[period_filters[j]]) @@ -428,33 +434,33 @@ class DynamicDML(LinearModelFinalCateEstimatorMixin, _OrthoLearner): est.fit(y, T, X=X, W=None, groups=groups, inference="auto") >>> est.const_marginal_effect(X[:2]) - array([[-0.336..., -0.048..., -0.061..., 0.042..., -0.204..., - 0.00667271], - [-0.101..., 0.433..., 0.054..., -0.217..., -0.101..., - -0.159...]]) + array([[-0.345..., -0.056..., -0.044..., 0.046..., -0.202..., + 0.023...], + [-0.120..., 0.434..., 0.052..., -0.201..., -0.115..., + -0.134...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([-0.601..., -0.091...]) + array([-0.579..., -0.085...]) >>> est.effect(X[:2], T0=np.zeros((2, n_periods*T.shape[1])), T1=np.ones((2, n_periods*T.shape[1]))) - array([-0.601..., -0.091...]) + array([-0.579..., -0.085...]) >>> est.coef_ - array([[ 0.112...], - [ 0.231...], - [ 0.055...], - [-0.125...], - [ 0.049...], - [-0.079...]]) + array([[ 0.108...], + [ 0.235...], + [ 0.046...], + [-0.119...], + [ 0.042...], + [-0.075...]]) >>> est.coef__interval() - (array([[-0.063...], - [-0.009...], - [-0.114...], - [-0.413...], - [-0.117...], - [-0.262...]]), array([[0.289...], - [0.471...], - [0.225...], - [0.163...], - [0.216...], - [0.103...]])) + (array([[-0.042...], + [-0.001...], + [-0.120...], + [-0.393...], + [-0.120...], + [-0.256...]]), array([[0.258...], + [0.473...], + [0.212...], + [0.154...], + [0.204...], + [0.104...]])) """ def __init__(self, *, @@ -534,30 +540,18 @@ def _gen_featurizer(self): return clone(self.featurizer, safe=False) def _gen_model_y(self): - if self.model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_y = clone(self.model_y, safe=False) - return _FirstStageWrapper(model_y, True, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_y, is_discrete=False, random_state=self.random_state) def _gen_model_t(self): - if self.model_t == 'auto': - if self.discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), - random_state=self.random_state) - else: - model_t = WeightedLassoCVWrapper(random_state=self.random_state) - else: - model_t = clone(self.model_t, safe=False) - return _FirstStageWrapper(model_t, False, self._gen_featurizer(), - self.linear_first_stages, self.discrete_treatment) + return _make_first_stage_selector(self.model_t, + is_discrete=self.discrete_treatment, + random_state=self.random_state) def _gen_model_final(self): return StatsModelsLinearRegression(fit_intercept=False) def 
_gen_ortho_learner_model_nuisance(self): - return _DynamicModelNuisance( + return _DynamicModelNuisanceSelector( model_t=self._gen_model_t(), model_y=self._gen_model_y(), n_periods=self._n_periods) diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index 0c90c6868..7c29cbd70 100644 --- a/econml/sklearn_extensions/linear_model.py +++ b/econml/sklearn_extensions/linear_model.py @@ -20,8 +20,7 @@ import warnings from collections.abc import Iterable from scipy.stats import norm -from econml.sklearn_extensions.model_selection import WeightedKFold, WeightedStratifiedKFold -from econml.utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays +from ..utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays from sklearn import clone from sklearn.linear_model import LinearRegression, LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso from sklearn.linear_model._base import _preprocess_data @@ -41,7 +40,24 @@ from typing import List +class _WeightedCVIterableWrapper(_CVIterableWrapper): + def __init__(self, cv): + super().__init__(cv) + + def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().get_n_splits(X, y, groups) + + def split(self, X=None, y=None, groups=None, sample_weight=None): + if groups is not None and sample_weight is not None: + raise ValueError("Cannot simultaneously use grouping and weighting") + return super().split(X, y, groups) + + def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): + # local import to avoid circular imports + from .model_selection import WeightedKFold, WeightedStratifiedKFold cv = 5 if cv is None else cv if isinstance(cv, numbers.Integral): if (classifier and (y is not None) and @@ -60,21 +76,6 @@ def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None): return cv # New style cv objects are passed without any modification -class _WeightedCVIterableWrapper(_CVIterableWrapper): - def __init__(self, cv): - super().__init__(cv) - - def get_n_splits(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().get_n_splits(X, y, groups) - - def split(self, X=None, y=None, groups=None, sample_weight=None): - if groups is not None and sample_weight is not None: - raise ValueError("Cannot simultaneously use grouping and weighting") - return super().split(X, y, groups) - - class WeightedModelMixin: """Mixin class for weighted models. @@ -1204,73 +1205,91 @@ def _set_attribute(self, attribute_name, condition=True, default=None): setattr(self, attribute_name, attribute_value) -class WeightedLassoCVWrapper: - """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" +class _PairedEstimatorWrapper: + """Helper class to wrap two different estimators, one of which can be used only with single targets and the other + which can be used on multiple targets. 
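For example, the concrete WeightedLassoCVWrapper defined below keeps its public behavior of dispatching on the shape of the target; a short sketch with made-up data:

    import numpy as np
    from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper

    rng = np.random.default_rng(1)
    X = rng.normal(size=(100, 5))
    y1 = X[:, 0] + rng.normal(scale=0.1, size=100)   # single target
    y2 = np.column_stack([y1, -y1])                  # two targets

    est = WeightedLassoCVWrapper()
    est.fit(X, y1)   # uses the single-target estimator (WeightedLassoCV)
    est.fit(X, y2)   # switches to the multi-task estimator (WeightedMultiTaskLassoCV)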
Not intended to be used directly by users.""" + + _SingleEst = None + _MultiEst = None + _known_params = [] + _post_fit_attrs = [] def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs - # set model to WeightedLassoCV by default so there's always a model to get and set attributes on - self.model = WeightedLassoCV(*args, **kwargs) - - # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV - # (e.g. former has 'positive' and 'precompute' while latter does not) - known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', - 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + # set model to the single-target estimator by default so there's always a model to get and set attributes on + self.model = self._SingleEst(*args, **kwargs) def fit(self, X, y, sample_weight=None): - self.needs_unravel = False + self._needs_unravel = False params = {key: value for (key, value) in self.get_params().items() - if key in self.known_params} + if key in self._known_params} if ndim(y) == 2 and shape(y)[1] > 1: - self.model = WeightedMultiTaskLassoCV(**params) + self.model = self._MultiEst(**params) else: if ndim(y) == 2 and shape(y)[1] == 1: y = np.ravel(y) - self.needs_unravel = True - self.model = WeightedLassoCV(**params) + self._needs_unravel = True + self.model = self._SingleEst(**params) self.model.fit(X, y, sample_weight) - # set intercept_ attribute - self.intercept_ = self.model.intercept_ - # set coef_ attribute - self.coef_ = self.model.coef_ - # set alpha_ attribute - self.alpha_ = self.model.alpha_ - # set alphas_ attribute - self.alphas_ = self.model.alphas_ - # set n_iter_ attribute - self.n_iter_ = self.model.n_iter_ + for param in self._post_fit_attrs: + setattr(self, param, getattr(self.model, param)) return self def predict(self, X): predictions = self.model.predict(X) - return reshape(predictions, (-1, 1)) if self.needs_unravel else predictions + return reshape(predictions, (-1, 1)) if self._needs_unravel else predictions def score(self, X, y, sample_weight=None): return self.model.score(X, y, sample_weight) def __getattr__(self, key): - if key in self.known_params: + if key in self._known_params: return getattr(self.model, key) else: raise AttributeError("No attribute " + key) def __setattr__(self, key, value): - if key in self.known_params: + if key in self._known_params: setattr(self.model, key, value) else: super().__setattr__(key, value) def get_params(self, deep=True): """Get parameters for this estimator.""" - return self.model.get_params(deep=deep) + return {k: v for k, v in self.model.get_params(deep=deep).items() if k in self._known_params} def set_params(self, **params): """Set parameters for this estimator.""" self.model.set_params(**params) +class WeightedLassoCVWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLassoCV or WeightedMultiTaskLassoCV depending on the shape of the target.""" + + _SingleEst = WeightedLassoCV + _MultiEst = WeightedMultiTaskLassoCV + + # whitelist known params because full set is not necessarily identical between LassoCV and MultiTaskLassoCV + # (e.g. 
former has 'positive' and 'precompute' while latter does not) + _known_params = set(['eps', 'n_alphas', 'alphas', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'copy_X', + 'cv', 'verbose', 'n_jobs', 'random_state', 'selection']) + + _post_fit_attrs = set(['alpha_', 'alphas_', 'coef_', 'dual_gap_', + 'intercept_', 'n_iter_', 'n_features_in_', 'mse_path_']) + + +class WeightedLassoWrapper(_PairedEstimatorWrapper): + """Helper class to wrap either WeightedLasso or WeightedMultiTaskLasso depending on the shape of the target.""" + + _SingleEst = WeightedLasso + _MultiEst = WeightedMultiTaskLasso + _known_params = set(['alpha', 'fit_intercept', 'copy_X', 'max_iter', 'tol', + 'random_state', 'selection']) + _post_fit_attrs = set(['coef_', 'dual_gap_', 'intercept_', 'n_iter_', 'n_features_in_']) + + class SelectiveRegularization: """ Estimator of a linear model where regularization is applied to only a subset of the coefficients. diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index 79b714bbc..4b1456d51 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -1,25 +1,35 @@ # Copyright (c) PyWhy contributors. All rights reserved. # Licensed under the MIT License. - """Collection of scikit-learn extensions for model selection techniques.""" import numbers import warnings -import sklearn -from sklearn.base import BaseEstimator -from sklearn.utils.multiclass import type_of_target +import abc + import numpy as np +from collections.abc import Iterable import scipy.sparse as sp +import sklearn from joblib import Parallel, delayed -from sklearn.base import clone, is_classifier -from sklearn.model_selection import KFold, StratifiedKFold, check_cv, GridSearchCV +from sklearn.base import BaseEstimator, clone, is_classifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.exceptions import FitFailedWarning +from sklearn.linear_model import (ElasticNet, ElasticNetCV, Lasso, LassoCV, MultiTaskElasticNet, MultiTaskElasticNetCV, + MultiTaskLasso, MultiTaskLassoCV, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, + LogisticRegression, LogisticRegressionCV) +from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, GroupKFold, KFold, + RandomizedSearchCV, StratifiedKFold, + check_cv) # TODO: conisder working around relying on sklearn implementation details from sklearn.model_selection._validation import (_check_is_permutation, _fit_and_predict) -from sklearn.preprocessing import LabelEncoder -from sklearn.utils import indexable, check_random_state +from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.utils import check_random_state, indexable +from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples +from .linear_model import WeightedLassoCVWrapper, WeightedLassoWrapper + def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False): random_state = self.random_state if self.shuffle else None @@ -256,6 +266,320 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits +class ModelSelector(metaclass=abc.ABCMeta): + """ + This class enables a two-stage fitting process, where first a model is selected + by calling `train` with `is_selecting=True`, and then the selected model is fit (presumably + on a different data set) by calling train with `is_selecting=False`. 
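A minimal usage sketch of this two-stage protocol, relying on the SklearnCVSelector wrapper introduced further down in this patch and splitting a toy sample by hand (all data here is made up):

    import numpy as np
    from sklearn.linear_model import LassoCV
    from econml.sklearn_extensions.model_selection import SklearnCVSelector

    rng = np.random.default_rng(2)
    X = rng.normal(size=(200, 4))
    y = X @ np.array([1.0, 0.0, 0.0, 2.0]) + rng.normal(scale=0.1, size=200)

    selector = SklearnCVSelector(LassoCV(cv=3))
    selector.train(True, X[:100], y[:100])    # selection pass: run the inner CV, keep the winning hyperparameters
    selector.train(False, X[100:], y[100:])   # fitting pass: refit the chosen model on fresh data
    print(selector.best_model.alpha, selector.predict(X[:2]))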
+ + + """ + + @abc.abstractmethod + def train(self, is_selecting: bool, *args, **kwargs): + """ + Either selects a model or fits a model, depending on the value of `is_selecting`. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def predict(self, *args, **kwargs): + """ + Predicts using the selected model; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + @abc.abstractmethod + def score(self, *args, **kwargs): + """ + Gets the score of the selected model on the given data; should not be called until after `train` has been used + both to select a model and to fit it. + """ + raise NotImplementedError("Abstract method") + + +class SingleModelSelector(ModelSelector): + """ + A model selection class that selects a single best model; + this encompasses random search, grid search, ensembling, etc. + """ + + @property + @abc.abstractmethod + def best_model(self): + raise NotImplementedError("Abstract method") + + @property + @abc.abstractmethod + def best_score(self): + raise NotImplementedError("Abstract method") + + def predict(self, *args, **kwargs): + return self.best_model.predict(*args, **kwargs) + + def predict_proba(self, *args, **kwargs): + return self.best_model.predict_proba(*args, **kwargs) + + def score(self, *args, **kwargs): + if hasattr(self.best_model, 'score'): + return self.best_model.score(*args, **kwargs) + else: + return None + + +def _fit_with_groups(model, X, y, *, groups, **kwargs): + """ + Fits a model while correctly handling grouping if necessary. + + This enables us to perform an inner-loop cross-validation of a model + which handles grouping correctly, which is not easy using typical sklearn models. + + For example, GridSearchCV and RandomSearchCV both support passing `groups` to fit, + but other CV-related estimators (e.g. LassoCV) do not, which means that GroupKFold + cannot be used as the cv instance, because the `groups` argument will never be passed through + to GroupKFold's `split` method. + + The hacky workaround here is to explicitly set the `cv` attribute to the set of + rows that GroupKFold would have generated rather than using GroupKFold as the cv instance. + """ + if groups is not None: + if hasattr(model, 'cv'): + old_cv = model.cv + # logic copied from check_cv + cv = 5 if old_cv is None else old_cv + if isinstance(cv, numbers.Integral): + cv = GroupKFold(cv) + # otherwise we will assume the user already set the cv attribute to something + # compatible with splitting with a `groups` argument + + splits = list(cv.split(X, y, groups=groups)) + try: + model.cv = splits + return model.fit(X, y, **kwargs) # drop groups from arg list + finally: + model.cv = old_cv + + # drop groups from arg list, which were already used at the outer level and may not be supported by the model + return model.fit(X, y, **kwargs) + + +class FixedModelSelector(SingleModelSelector): + """ + Model selection class that always selects the given model + """ + + def __init__(self, model): + self.model = clone(model, safe=False) + + def train(self, is_selecting, *args, groups=None, **kwargs): + # whether selecting or not, need to train the model on the data + _fit_with_groups(self.model, *args, groups=groups, **kwargs) + if is_selecting and hasattr(self.model, 'score'): + # TODO: we need to alter this to use out-of-sample score here, which + # will require cross-validation, but should respect grouping, stratifying, etc. 
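A standalone sketch of what `_fit_with_groups` does for an estimator that exposes a `cv` attribute (the group labels below are made up):

    import numpy as np
    from sklearn.linear_model import LassoCV
    from econml.sklearn_extensions.model_selection import _fit_with_groups

    rng = np.random.default_rng(3)
    X = rng.normal(size=(60, 3))
    y = X[:, 0] + rng.normal(scale=0.1, size=60)
    groups = np.repeat(np.arange(12), 5)          # 12 groups of 5 rows each

    model = LassoCV(cv=3)
    _fit_with_groups(model, X, y, groups=groups)  # temporarily swaps model.cv for GroupKFold(3) splits, then restores it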
+ self._score = self.model.score(*args, **kwargs) + return self + + @property + def best_model(self): + return self.model + + @property + def best_score(self): + return self._score + + +def _copy_to(m1, m2, attrs, insert_underscore=False): + for attr in attrs: + setattr(m2, attr, getattr(m1, attr + "_" if insert_underscore else attr)) + + +def _convert_linear_model(model, new_cls, extra_attrs=[]): + new_model = new_cls() + # copy common parameters + _copy_to(model, new_model, ["fit_intercept", "max_iter", + "tol", + "random_state"]) + # copy common fitted variables + _copy_to(model, new_model, ["coef_", "intercept_", "n_features_in_", "n_iter_"]) + # copy attributes unique to this class + _copy_to(model, new_model, extra_attrs) + return new_model + + +def _to_logisticRegression(model: LogisticRegressionCV): + lr = _convert_linear_model(model, LogisticRegression) + _copy_to(model, lr, ["penalty", "dual", "intercept_scaling", + "class_weight", + "solver", "multi_class", + "verbose", "n_jobs"]) + _copy_to(model, lr, ["classes_"]) + + _copy_to(model, lr, ["C", "l1_ratio"], True) # these are arrays in LogisticRegressionCV, need to convert them next + + # make sure all classes agree on best c/l1 combo + assert np.isclose(lr.C, lr.C.flatten()[0]).all() + assert np.equal(lr.l1_ratio, None).all() or np.isclose(lr.l1_ratio, lr.l1_ratio.flatten()[0]).all() + lr.C = lr.C[0] + lr.l1_ratio = lr.l1_ratio[0] + avg_scores = np.average([v for k, v in model.scores_.items()], axis=1) # average over folds + best_scores = np.max(avg_scores, axis=tuple(range(1, avg_scores.ndim))) # average score of best c/l1 combo + assert np.isclose(best_scores, best_scores.flatten()[0]).all() # make sure all folds agree on best c/l1 combo + return lr, best_scores[0] + + +def _convert_linear_regression(model, new_cls, extra_attrs=["positive"]): + new_model = _convert_linear_model(model, new_cls, ["copy_X", + "n_iter_"]) + _copy_to(model, new_model, ["alpha"], True) + return new_model + + +def _to_elasticNet(model: ElasticNetCV, is_lasso=False, cls=None, extra_attrs=[]): + cls = cls or (Lasso if is_lasso else ElasticNet) + new_model = _convert_linear_regression(model, cls, extra_attrs + ['selection', 'warm_start', + 'dual_gap_']) + if not is_lasso: + # l1 ratio doesn't apply to Lasso, only ElasticNet + _copy_to(model, new_model, ["l1_ratio"], True) + max_score = np.max(np.mean(model.mse_path_, axis=-1)) # last dimension in mse_path is folds, so average over that + return new_model, max_score + + +def _to_ridge(model, cls=Ridge, extra_attrs=["positive"]): + ridge = _convert_linear_regression(model, cls, extra_attrs + ["_normalize", "solver"]) + best_score = model.best_score_ + return ridge, best_score + + +class SklearnCVSelector(SingleModelSelector): + """ + Wraps one of sklearn's CV classes in the ModelSelector interface + """ + + def __init__(self, searcher): + self.searcher = clone(searcher) + + @staticmethod + def convertible_types(): + return {GridSearchCV, RandomizedSearchCV} | SklearnCVSelector._model_mapping().keys() + + @staticmethod + def can_wrap(model): + return any(isinstance(model, model_type) for model_type in SklearnCVSelector.convertible_types()) + + @staticmethod + def _model_mapping(): + return {LogisticRegressionCV: _to_logisticRegression, + ElasticNetCV: _to_elasticNet, + LassoCV: lambda model: _to_elasticNet(model, True, None, ["positive"]), + RidgeCV: _to_ridge, + RidgeClassifierCV: lambda model: _to_ridge(model, RidgeClassifier, ["positive", "class_weight", + "_label_binarizer"]), + MultiTaskElasticNetCV: 
lambda model: _to_elasticNet(model, False, MultiTaskElasticNet, extra_attrs=[]), + MultiTaskLassoCV: lambda model: _to_elasticNet(model, True, MultiTaskLasso, extra_attrs=[]), + WeightedLassoCVWrapper: lambda model: _to_elasticNet(model, True, WeightedLassoWrapper, + extra_attrs=[]), + } + + def train(self, is_selecting: bool, *args, groups=None, **kwargs): + if is_selecting: + _fit_with_groups(self.searcher, *args, groups=groups, **kwargs) + + if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, RandomizedSearchCV): + self._best_model = self.searcher.best_estimator_ + self._best_score = self.searcher.best_score_ + + for known_type in self._model_mapping().keys(): + if isinstance(self.searcher, known_type): + converter = self._model_mapping()[known_type] + self._best_model, self._best_score = converter(self.searcher) + return self + + else: + # don't need to use _fit_with_groups here since none of these models support it + self.best_model.fit(*args, **kwargs) + return self + + @property + def best_model(self): + return self._best_model + + @property + def best_score(self): + return self._best_score + + +class ListSelector(SingleModelSelector): + """ + Model selection class that selects the best model from a list of model selectors + + Parameters + ---------- + models : list of ModelSelector + The list of model selectors to choose from + unwrap : bool, default True + Whether to return the best model's best model, rather than just the outer best model selector + """ + + def __init__(self, models, unwrap=True): + self.models = [clone(model, safe=False) for model in models] + self.unwrap = unwrap + + def train(self, is_selecting, *args, **kwargs): + if is_selecting: + scores = [] + for model in self.models: + model.train(is_selecting, *args, **kwargs) + scores.append(model.best_score) + self._all_scores = scores + self._best_score = np.max(scores) + self._best_model = self.models[np.argmax(scores)] + + else: + self._best_model.train(is_selecting, *args, **kwargs) + + @property + def best_model(self): + """ + Gets the best model; note that if we were selecting over SingleModelSelectors and `unwrap` is `False`, + we will return the SingleModelSelector instance, not its best model. 
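Putting these pieces together, the `get_selector` factory defined just below wraps a list of candidates in a ListSelector; a hypothetical sketch of choosing between the default linear and forest candidates for a continuous target:

    import numpy as np
    from econml.sklearn_extensions.model_selection import ListSelector, get_selector

    rng = np.random.default_rng(4)
    X = rng.normal(size=(300, 4))
    y = X @ np.array([1.0, 0.5, 0.0, 0.0]) + rng.normal(scale=0.1, size=300)

    sel = get_selector(['linear', 'forest'], is_discrete=False, cv=3, random_state=0)
    print(isinstance(sel, ListSelector))    # True
    sel.train(True, X, y)                   # score every candidate and remember the winner
    sel.train(False, X, y)                  # refit only the winning model
    print(type(sel.best_model).__name__)    # the selected underlying estimator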
+ """ + return self._best_model.best_model if self.unwrap else self._best_model + + @property + def best_score(self): + return self._best_score + + +def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV): + named_models = { + 'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete + else WeightedLassoCVWrapper(random_state=random_state, cv=cv)), + 'forest': (GridSearchCV(RandomForestClassifier(random_state=random_state) if is_discrete + else RandomForestRegressor(random_state=random_state), + param_grid={}, cv=cv)), + } + if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything + return input + elif isinstance(input, list): # we've got a list; call get_selector on each element, then wrap in a ListSelector + models = [get_selector(model, is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + for model in input] + return ListSelector(models) + elif isinstance(input, str): # we've got a string; look it up + if input in named_models: + return get_selector(named_models[input], is_discrete, + random_state=random_state, cv=cv, wrapper=wrapper) + else: + raise ValueError(f"Unknown model type: {input}, must be one of {named_models.keys()}") + elif SklearnCVSelector.can_wrap(input): + return SklearnCVSelector(input) + else: # assume this is an sklearn-compatible model + return FixedModelSelector(input) + + class GridSearchCVList(BaseEstimator): """ An extension of GridSearchCV that allows for passing a list of estimators each with their own parameter grid and returns the best among all estimators in the list and hyperparameter in their diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py index 195b4615d..57f6e9051 100644 --- a/econml/tests/test_dml.py +++ b/econml/tests/test_dml.py @@ -159,7 +159,7 @@ def make_random(n, is_discrete, d): True, ['auto']), (LinearDML(model_y=Lasso(), - model_t='auto', + model_t=model_t, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, discrete_treatment=is_discrete, @@ -1012,15 +1012,14 @@ def prediction_stderr(self, X): assert dml.marginal_effect_interval(1) == (1, 1) def test_sparse(self): - for _ in range(5): - # Ensure reproducibility - np.random.seed(1234) - n_p = np.random.randint(2, 5) # 2 to 4 products - d_w = np.random.randint(0, 5) # random number of covariates - min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product - n_r = np.random.randint(min_n, min_n + 3) - with self.subTest(n_p=n_p, d_w=d_w, n_r=n_r): - TestDML._test_sparse(n_p, d_w, n_r) + # Ensure reproducibility + np.random.seed(123) + n_p = np.random.randint(2, 5) # 2 to 4 products + d_w = np.random.randint(0, 5) # random number of covariates + min_n = np.ceil(2 + d_w * (1 + (d_w + 1) / n_p)) # minimum number of rows per product + n_r = np.random.randint(min_n, min_n + 3) + with self.subTest(n_p=n_p, d_w=d_w, n_r=n_r): + TestDML._test_sparse(n_p, d_w, n_r) def test_linear_sparse(self): """SparseDML test with a sparse DGP""" @@ -1051,7 +1050,10 @@ def test_linear_sparse(self): Y = T * (x @ a) + xw @ g + err_Y # Test sparse estimator # --> test coef_, intercept_ - sparse_dml = SparseLinearDML(fit_cate_intercept=False) + # with this DGP, since T depends linearly on X, Y depends on X quadratically + # so we should use a quadratic featurizer + sparse_dml = SparseLinearDML(fit_cate_intercept=False, model_y=Pipeline([('poly', PolynomialFeatures(2)), + ('lr', LassoCV())])) sparse_dml.fit(Y, T, X=x, W=w) np.testing.assert_allclose(a, 
sparse_dml.coef_, atol=2e-1) with pytest.raises(AttributeError): @@ -1125,11 +1127,12 @@ def _test_sparse(n_p, d_w, n_r): y[fold * n:(fold + 1) * n] = y_f t[fold * n:(fold + 1) * n] = t_f - dml = SparseLinearDML(model_y=LinearRegression(fit_intercept=False), + # we have quadratic terms in y, so we need to pipeline with a quadratic featurizer + dml = SparseLinearDML(model_y=Pipeline([('poly', PolynomialFeatures(2)), + ('lr', LinearRegression(fit_intercept=False))]), model_t=LinearRegression(fit_intercept=False), fit_cate_intercept=False) dml.fit(y, t, X=x, W=w) - np.testing.assert_allclose(a, dml.coef_.reshape(-1), atol=1e-1) eff = reshape(t * np.choose(np.tile(p, 2), a), (-1,)) np.testing.assert_allclose(eff, dml.effect(x, T0=0, T1=t), atol=1e-1) @@ -1237,8 +1240,8 @@ def test_groups(self): # test outer grouping # with 2 folds, we should get exactly 3 groups per split, each with 10 copies of the y or t value - est = LinearDML(model_y=GroupingModel(LinearRegression(), (3, 3), n_copies), - model_t=GroupingModel(LinearRegression(), (3, 3), n_copies)) + est = LinearDML(model_y=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), + model_t=GroupingModel(LinearRegression(), 60, (3, 3), n_copies)) est.fit(y, t, groups=groups) # test nested grouping @@ -1246,17 +1249,10 @@ def test_groups(self): # with 2-fold outer and 2-fold inner grouping, and six total groups, # should get 1 or 2 groups per split - est = LinearDML(model_y=NestedModel(LassoCV(cv=2), (1, 2), n_copies), - model_t=NestedModel(LassoCV(cv=2), (1, 2), n_copies)) + est = LinearDML(model_y=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), + model_t=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies)) est.fit(y, t, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does this imply we should change some defaults to make this more likely to succeed? 
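The removed assertion above depended on the old defaults failing when there are too few groups to split both the outer and inner samples with grouping; with small explicit inner CVs the same kind of grouped fit goes through. A hypothetical end-to-end sketch:

    import numpy as np
    from sklearn.linear_model import LassoCV
    from econml.dml import LinearDML

    rng = np.random.default_rng(5)
    n_groups, n_copies = 6, 10
    groups = np.repeat(np.arange(n_groups), n_copies)
    X = rng.normal(size=(n_groups * n_copies, 2))
    T = rng.normal(size=(n_groups * n_copies,))
    y = T + X[:, 0] + rng.normal(scale=0.1, size=n_groups * n_copies)

    est = LinearDML(model_y=LassoCV(cv=2), model_t=LassoCV(cv=2), cv=2)
    est.fit(y, T, X=X, groups=groups)   # 3 groups per outer training fold leave room for GroupKFold(2) inside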
- est = LinearDML(model_y=LassoCV(cv=5), model_t=LassoCV(cv=5)) - with pytest.raises(Exception): - est.fit(y, t, groups=groups) - def test_treatment_names(self): Y = np.random.normal(size=(100, 1)) T = np.random.binomial(n=1, p=0.5, size=(100, 1)) diff --git a/econml/tests/test_dmliv.py b/econml/tests/test_dmliv.py index f52c14356..e2dc6c21e 100644 --- a/econml/tests/test_dmliv.py +++ b/econml/tests/test_dmliv.py @@ -8,12 +8,12 @@ import pytest from scipy import special from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV from sklearn.preprocessing import PolynomialFeatures from econml.iv.dml import OrthoIV, DMLIV, NonParamDMLIV from econml.iv.dr._dr import _DummyCATE -from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression +from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, WeightedLassoCVWrapper from econml.utilities import shape from econml.tests.utilities import GroupingModel @@ -62,26 +62,40 @@ def eff_shape(n, d_x, d_y): None, PolynomialFeatures(degree=2, include_bias=False), ]: + # since we're running so many combinations, just use LassoCV/LogisticRegressionCV + # for the models instead of also selecting over random forest models est_list = [ OrthoIV( + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_z_xw=LogisticRegressionCV() if binary_Z else WeightedLassoCVWrapper(), projection=False, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), OrthoIV( + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), projection=True, featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), DMLIV( + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), model_final=LinearRegression(fit_intercept=False), featurizer=featurizer, discrete_treatment=binary_T, discrete_instrument=binary_Z, ), NonParamDMLIV( + model_y_xw=WeightedLassoCVWrapper(), + model_t_xw=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), + model_t_xwz=LogisticRegressionCV() if binary_T else WeightedLassoCVWrapper(), model_final=RandomForestRegressor(), featurizer=featurizer, discrete_treatment=binary_T, @@ -207,7 +221,7 @@ def test_groups(self): projection=False, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_z_xw=LogisticRegression(), ), @@ -215,7 +229,7 @@ def test_groups(self): projection=True, discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -223,7 +237,7 @@ def test_groups(self): model_final=LinearRegression(fit_intercept=False), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, 
n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), @@ -231,7 +245,7 @@ def test_groups(self): model_final=RandomForestRegressor(), discrete_treatment=True, discrete_instrument=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims, n_copies), model_t_xw=LogisticRegression(), model_t_xwz=LogisticRegression(), ), diff --git a/econml/tests/test_driv.py b/econml/tests/test_driv.py index 39b90c1ed..9ff237c47 100644 --- a/econml/tests/test_driv.py +++ b/econml/tests/test_driv.py @@ -13,7 +13,7 @@ import pickle from scipy import special from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression import unittest try: @@ -74,7 +74,14 @@ def eff_shape(n, d_x): Z = np.random.normal(size=(n,)) est_list = [ + # we're running a lot of tests, so use fixed models instead of model selection DRIV( + model_y_xw=LinearRegression(), + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), model_final=StatsModelsLinearRegression( fit_intercept=False @@ -88,6 +95,12 @@ def eff_shape(n, d_x): use_ray=use_ray, ), LinearDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ -98,6 +111,12 @@ def eff_shape(n, d_x): use_ray=use_ray, ), SparseLinearDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), fit_cate_intercept=True, projection=projection, @@ -108,6 +127,12 @@ def eff_shape(n, d_x): use_ray=use_ray, ), ForestDRIV( + model_y_xw=LinearRegression(), + model_t_xw=LogisticRegression() if binary_T else LinearRegression(), + model_tz_xw=LogisticRegression() if binary_T and binary_Z and not ( + projection or fit_cov_directly) else LinearRegression(), + model_t_xwz="auto" if not projection else LogisticRegression() if binary_T else LinearRegression(), + model_z_xw="auto" if projection else LogisticRegression() if binary_Z else LinearRegression(), flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False), projection=projection, fit_cov_directly=fit_cov_directly, @@ -125,6 +150,8 @@ def eff_shape(n, d_x): if 
binary_T and binary_Z and not fit_cov_directly: est_list += [ IntentToTreatDRIV( + model_y_xw=LinearRegression(), + model_t_xwz=LogisticRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), @@ -133,6 +160,8 @@ def eff_shape(n, d_x): use_ray=use_ray, ), LinearIntentToTreatDRIV( + model_y_xw=LinearRegression(), + model_t_xwz=LogisticRegression(), flexible_model_effect=StatsModelsLinearRegression( fit_intercept=False ), @@ -207,7 +236,7 @@ def test_cate_api_without_ray(self): self._test_cate_api(use_ray=False) def _test_accuracy(self, use_ray=False): - np.random.seed(42) + np.random.seed(0) # dgp (binary T, binary Z) @@ -281,7 +310,10 @@ def test_accuracy_without_ray(self): def test_fit_cov_directly(self): # fitting the covariance directly should be at least as good as computing the covariance from separate models - est = LinearDRIV() + + # set the models so that model selection over random forests doesn't take too much time in the repeated trials + est = LinearDRIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), model_z_xw=LinearRegression(), + model_tz_xw=LassoCV()) n = 500 p = 10 @@ -334,8 +366,8 @@ def ceil(a, b): # ceiling analog of // DRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -344,8 +376,8 @@ def ceil(a, b): # ceiling analog of // LinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -354,8 +386,8 @@ def ceil(a, b): # ceiling analog of // SparseLinearDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), @@ -364,20 +396,20 @@ def ceil(a, b): # ceiling analog of // ForestDRIV( discrete_instrument=True, discrete_treatment=True, - model_y_xw=GroupingModel(LinearRegression(), ct_lims_2, n_copies), - model_z_xw=LinearRegression(), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_2, n_copies), + model_z_xw=LogisticRegression(), model_t_xw=LogisticRegression(), model_tz_xw=LinearRegression(), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), IntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ), LinearIntentToTreatDRIV( - model_y_xw=GroupingModel(LinearRegression(), ct_lims_3, n_copies), + model_y_xw=GroupingModel(LinearRegression(), n, ct_lims_3, n_copies), model_t_xwz=LogisticRegression(), prel_cate_approach='dmliv' ) diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index f6a5e4ae8..a02aea617 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -765,6 +765,7 @@ 
def test_DRLearner(self): outcome_model = Pipeline( [('poly', PolynomialFeatures()), ('model', LinearRegression())]) DR_learner = DRLearner(model_regression=outcome_model, + model_propensity=LogisticRegressionCV(), model_final=LinearRegression()) self._test_te(DR_learner, tol=0.5, te_type="heterogeneous") # Test heterogenous treatment effect for W =/= None @@ -828,26 +829,17 @@ def test_groups(self): # cross-fit generate one est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold grouping, we should get exactly 3 groups per split - model_regression=GroupingModel(LinearRegression(), (3, 3), n_copies), + model_regression=GroupingModel(LinearRegression(), 60, (3, 3), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) # test nested grouping est = LinearDRLearner(model_propensity=LogisticRegression(), # with 2-fold outer and 2-fold inner grouping, we should get 1-2 groups per split - model_regression=NestedModel(LassoCV(cv=2), (1, 2), n_copies), + model_regression=NestedModel(LassoCV(cv=2), 60, (1, 2), n_copies), cv=StratifiedGroupKFold(2)) est.fit(y, t, W=w, groups=groups) - # by default, we use 5 split cross-validation for our T and Y models - # but we don't have enough groups here to split both the outer and inner samples with grouping - # TODO: does this imply we should change some defaults to make this more likely to succeed? - est = LinearDRLearner(model_propensity=LogisticRegressionCV(cv=5), - model_regression=LassoCV(cv=5), - cv=StratifiedGroupKFold(2)) - with pytest.raises(Exception): - est.fit(y, t, W=w, groups=groups) - def test_score(self): """Test that scores are the same no matter whether the prediction of cate model has the same shape of input or the shape of input.reshape(-1,1).""" diff --git a/econml/tests/test_missing_values.py b/econml/tests/test_missing_values.py index 66d917f76..eb1c4f7e4 100644 --- a/econml/tests/test_missing_values.py +++ b/econml/tests/test_missing_values.py @@ -27,7 +27,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 66c389ae0..142f23563 100644 --- a/econml/tests/test_ortho_learner.py +++ b/econml/tests/test_ortho_learner.py @@ -29,7 +29,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, Q, W=None): + def train(self, is_selecting, X, y, Q, W=None): self._model.fit(X, y) return self @@ -109,7 +109,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, W=None): + def train(self, is_selecting, X, y, W=None): self._model.fit(X, y) return self @@ -179,7 +179,7 @@ class Wrapper: def __init__(self, model): self._model = model - def fit(self, X, y, Q, W=None): + def train(self, is_selecting, X, y, Q, W=None): self._model.fit(X, y) return self @@ -219,7 +219,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -331,7 +331,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -378,7 +378,7 @@ def __init__(self, model_t, model_y): 
self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -434,7 +434,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def fit(self, Y, T, W=None): + def train(self, is_selecting, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1] + 1))) self._model_y.fit(W, Y) return self diff --git a/econml/tests/test_refit.py b/econml/tests/test_refit.py index 9cf93334c..00cc81dff 100644 --- a/econml/tests/test_refit.py +++ b/econml/tests/test_refit.py @@ -188,9 +188,9 @@ def test_orthoiv(self): est.model_t_xw = ElasticNet() est.model_z_xw = WeightedLasso() est.fit(y, T, Z=Z, W=W, cache_values=True) - assert isinstance(est.models_nuisance_[0][0]._model_y_xw._model, Lasso) - assert isinstance(est.models_nuisance_[0][0]._model_t_xw._model, ElasticNet) - assert isinstance(est.models_nuisance_[0][0]._model_z_xw._model, WeightedLasso) + assert isinstance(est.models_y_xw[0][0], Lasso) + assert isinstance(est.models_t_xw[0][0], ElasticNet) + assert isinstance(est.models_z_xw[0][0], WeightedLasso) est = DMLIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), @@ -202,9 +202,9 @@ def test_orthoiv(self): est.model_t_xw = ElasticNet() est.model_t_xwz = WeightedLasso() est.fit(y, T, Z=Z, X=X, W=W, cache_values=True) - assert isinstance(est.models_nuisance_[0][0]._model_y_xw._model, Lasso) - assert isinstance(est.models_nuisance_[0][0]._model_t_xw._model, ElasticNet) - assert isinstance(est.models_nuisance_[0][0]._model_t_xwz._model, WeightedLasso) + assert isinstance(est.models_y_xw[0][0], Lasso) + assert isinstance(est.models_t_xw[0][0], ElasticNet) + assert isinstance(est.models_t_xwz[0][0], WeightedLasso) est = NonParamDMLIV(model_y_xw=LinearRegression(), model_t_xw=LinearRegression(), diff --git a/econml/tests/test_treatment_featurization.py b/econml/tests/test_treatment_featurization.py index 834da1ffe..a5b825253 100644 --- a/econml/tests/test_treatment_featurization.py +++ b/econml/tests/test_treatment_featurization.py @@ -4,7 +4,7 @@ import unittest import numpy as np from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression from sklearn.ensemble import RandomForestRegressor from joblib import Parallel, delayed @@ -14,7 +14,7 @@ from econml.iv.dr import DRIV, LinearDRIV, SparseLinearDRIV, ForestDRIV from econml.orf import DMLOrthoForest from sklearn.preprocessing import OneHotEncoder, FunctionTransformer -from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression +from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, WeightedLassoCVWrapper from econml.utilities import jacify_featurizer from econml.iv.sieve import DPolynomialFeatures @@ -200,6 +200,25 @@ def sum_squeeze_func_transform(x): class TestTreatmentFeaturization(unittest.TestCase): def test_featurization(self): + # use LassoCV rather than also selecting over RandomForests to save time + dml_models = { + "model_t": WeightedLassoCVWrapper(), + "model_y": WeightedLassoCVWrapper() + } + + dmliv_models = { + "model_y_xw": WeightedLassoCVWrapper(), + "model_t_xw": WeightedLassoCVWrapper(), + "model_t_xwz": WeightedLassoCVWrapper(), + } + + driv_models = { + "model_y_xw": WeightedLassoCVWrapper(), + "model_t_xw": WeightedLassoCVWrapper(), + "model_z_xw": 
WeightedLassoCVWrapper(), + "model_tz_xw": WeightedLassoCVWrapper(), + } + identity_config = { 'DGP_params': { 'n': 2000, @@ -223,10 +242,10 @@ def test_featurization(self): 'squeeze_Ts': [False, True], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': LinearDML, 'init_args': {}}, - {'class': CausalForestDML, 'init_args': {}}, - {'class': SparseLinearDML, 'init_args': {}}, - {'class': KernelDML, 'init_args': {}}, + {'class': LinearDML, 'init_args': dml_models}, + {'class': CausalForestDML, 'init_args': dml_models}, + {'class': SparseLinearDML, 'init_args': dml_models}, + {'class': KernelDML, 'init_args': dml_models}, ] } @@ -253,10 +272,10 @@ def test_featurization(self): 'squeeze_Ts': [False, True], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': LinearDML, 'init_args': {}}, - {'class': CausalForestDML, 'init_args': {}}, - {'class': SparseLinearDML, 'init_args': {}}, - {'class': KernelDML, 'init_args': {}}, + {'class': LinearDML, 'init_args': dml_models}, + {'class': CausalForestDML, 'init_args': dml_models}, + {'class': SparseLinearDML, 'init_args': dml_models}, + {'class': KernelDML, 'init_args': dml_models}, ] } @@ -268,9 +287,11 @@ def test_featurization(self): poly_IV_config['DGP_params']['d_z'] = 1 poly_IV_config['DGP_params']['nuisance_TZ'] = lambda Z: Z poly_IV_config['est_dicts'] = [ - {'class': OrthoIV, 'init_args': { - 'model_t_xwz': RandomForestRegressor(random_state=1), 'projection': True}}, - {'class': DMLIV, 'init_args': {'model_t_xwz': RandomForestRegressor(random_state=1)}}, + {'class': OrthoIV, 'init_args': {**dmliv_models, + 'model_t_xwz': RandomForestRegressor(random_state=1), + 'projection': True}}, + {'class': DMLIV, 'init_args': {**dmliv_models, + 'model_t_xwz': RandomForestRegressor(random_state=1)}}, ] poly_1d_config = deepcopy(poly_config) @@ -287,11 +308,13 @@ def test_featurization(self): poly_1d_IV_config['treatment_featurizer'] = polynomial_1d_treatment_featurizer poly_1d_IV_config['actual_cme'] = poly_1d_actual_cme poly_1d_IV_config['est_dicts'] = [ - {'class': NonParamDMLIV, 'init_args': {'model_final': StatsModelsLinearRegression()}}, - {'class': DRIV, 'init_args': {'fit_cate_intercept': True}}, - {'class': LinearDRIV, 'init_args': {}}, - {'class': SparseLinearDRIV, 'init_args': {}}, - {'class': ForestDRIV, 'init_args': {}}, + {'class': NonParamDMLIV, 'init_args': {**dmliv_models, + 'model_final': StatsModelsLinearRegression()}}, + {'class': DRIV, 'init_args': {**driv_models, + 'fit_cate_intercept': True}}, + {'class': LinearDRIV, 'init_args': driv_models}, + {'class': SparseLinearDRIV, 'init_args': driv_models}, + {'class': ForestDRIV, 'init_args': driv_models}, ] sum_IV_config = { @@ -319,11 +342,13 @@ def test_featurization(self): 'squeeze_Ts': [False], 'squeeze_Ys': [False, True], 'est_dicts': [ - {'class': NonParamDMLIV, 'init_args': {'model_final': StatsModelsLinearRegression()}}, - {'class': DRIV, 'init_args': {'fit_cate_intercept': True}}, - {'class': LinearDRIV, 'init_args': {}}, - {'class': SparseLinearDRIV, 'init_args': {}}, - {'class': ForestDRIV, 'init_args': {}}, + {'class': NonParamDMLIV, 'init_args': {**dmliv_models, + 'model_final': StatsModelsLinearRegression()}}, + {'class': DRIV, 'init_args': {**driv_models, + 'fit_cate_intercept': True}}, + {'class': LinearDRIV, 'init_args': driv_models}, + {'class': SparseLinearDRIV, 'init_args': driv_models}, + {'class': ForestDRIV, 'init_args': driv_models}, ] } diff --git a/econml/tests/utilities.py b/econml/tests/utilities.py index 4c04cc89d..1c11be343 100644 --- 
a/econml/tests/utilities.py +++ b/econml/tests/utilities.py @@ -16,15 +16,17 @@ class GroupingModel: and the number of copies of each y value should be equal to the group size """ - def __init__(self, model, limits, n_copies): + def __init__(self, model, total, limits, n_copies): self.model = model + self.total = total self.limits = limits self.n_copies = n_copies - def validate(self, y): + def validate(self, y, skip_group_counts=False): (yvals, cts) = np.unique(y, return_counts=True) (llim, ulim) = self.limits - if not (llim <= len(yvals) <= ulim): + # if we aren't fitting on the whole dataset, ensure that the limits are respected + if (not skip_group_counts) and (not (llim <= len(yvals) <= ulim)): raise Exception(f"Grouping failed: received {len(yvals)} groups instead of {llim}-{ulim}") # ensure that the grouping has worked correctly and we get exactly the number of copies @@ -35,7 +37,7 @@ def validate(self, y): f"Grouping failed; received {ct} copies of {yval} instead of {self.n_copies[yval]}") def fit(self, X, y): - self.validate(y) + self.validate(y, len(y) == self.total) self.model.fit(X, y) return self @@ -46,12 +48,9 @@ def predict(self, X): class NestedModel(GroupingModel): """ Class for testing nested grouping. The wrapped model must have a 'cv' attribute; - this class exposes an identical 'cv' attribute, which is how nested CV is implemented in fit_with_groups + this class exposes an identical 'cv' attribute, which is how nested CV is implemented in _fit_with_groups """ - def __init__(self, model, limits, n_copies): - super().__init__(model, limits, n_copies) - # DML nested CV works via a 'cv' attribute @property def cv(self): @@ -64,6 +63,6 @@ def cv(self, value): def fit(self, X, y): for (train, test) in check_cv(self.cv, y).split(X, y): # want to validate the nested grouping, not the outer grouping in the nesting tests - self.validate(y[train]) + self.validate(y[train], len(y) == self.total) self.model.fit(X, y) return self diff --git a/econml/utilities.py b/econml/utilities.py index 84c577a93..f62ffbb4d 100644 --- a/econml/utilities.py +++ b/econml/utilities.py @@ -21,7 +21,6 @@ from sklearn.preprocessing import PolynomialFeatures import warnings from warnings import warn -from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold from collections.abc import Iterable from sklearn.utils.multiclass import type_of_target import numbers @@ -919,62 +918,6 @@ def filter_inds(coords, data, n): [arrs[indMap[c][0][0]].shape[indMap[c][0][1]] for c in outputs]) -def fit_with_groups(model, X, y, groups=None, **kwargs): - """ - Fit a model while correctly handling grouping if necessary. - - This enables us to perform an inner-loop cross-validation of a model - which handles grouping correctly, which is not easy using typical sklearn models. - - For example, GridSearchCV and RandomSearchCV both support passing 'groups' to fit, - but other CV-related estimators (such as those derived from LinearModelCV, including LassoCV), - do not support passing groups to fit which meanst that GroupKFold cannot be used as the cv instance - when using these types, because the required 'groups' argument will never be passed to the - GroupKFold's split method. 
See also https://github.com/scikit-learn/scikit-learn/issues/12052 - - The (hacky) workaround that is used here is to explicitly set the 'cv' attribute (if there is one) to - the exact set of rows and not to use GroupKFold even with the sklearn classes that could support it; - this should work with classes derived from BaseSearchCV, LinearModelCV, and CalibratedClassifierCV. - - Parameters - ---------- - model : estimator - The model to fit - X : array_like - The features to fit against - y : array_like - The target to fit against - groups : array_like, optional - The set of groupings that should be kept together when splitting rows for - cross-validation - kwargs : dict - Any other named arguments to pass to the model's fit - """ - - if groups is not None: - # assume that we should perform nested cross-validation if and only if - # the model has a 'cv' attribute; this is a somewhat brittle assumption... - if hasattr(model, 'cv'): - old_cv = model.cv - # logic copied from check_cv - cv = 5 if old_cv is None else old_cv - if isinstance(cv, numbers.Integral): - cv = GroupKFold(cv) - # otherwise we will assume the user already set the cv attribute to something - # compatible with splitting with a 'groups' argument - - # now we have to compute the folds explicitly because some classifiers (like LassoCV) - # don't use the groups when calling split internally - splits = list(cv.split(X, y, groups=groups)) - try: - model.cv = splits - return model.fit(X, y, **kwargs) - finally: - model.cv = old_cv - - return model.fit(X, y, **kwargs) - - def filter_none_kwargs(**kwargs): """ Filters out any keyword arguments that are None. diff --git a/notebooks/Scaling EconML using Ray.ipynb b/notebooks/Scaling EconML using Ray.ipynb index 49c9a44fa..217bd122a 100644 --- a/notebooks/Scaling EconML using Ray.ipynb +++ b/notebooks/Scaling EconML using Ray.ipynb @@ -35,11 +35,11 @@ "execution_count": 4, "id": "01b70101-d4ad-40fc-baa6-565795ee897a", "metadata": { - "tags": [], "ExecuteTime": { "end_time": "2023-08-16T18:32:09.629351Z", "start_time": "2023-08-16T18:32:09.627091Z" - } + }, + "tags": [] }, "outputs": [], "source": [ @@ -47,9 +47,8 @@ "import os\n", "import numpy as np\n", "import scipy\n", - "from econml.dml import DML\n", + "from econml.dml import LinearDML\n", "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] @@ -125,9 +124,10 @@ "outputs": [], "source": [ "np.random.seed(123)\n", - "X = np.random.normal(size=(10000, 5))\n", + "n = 5000\n", + "X = np.random.normal(size=(n, 5))\n", "T = np.random.binomial(1, scipy.special.expit(X[:, 0]))\n", - "y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(10000,))" + "y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(n,))" ] }, { @@ -171,11 +171,9 @@ "\n", "ray_opts = {'num_cpus':2,'scheduling_strategy':'SPREAD'}\n", "\n", - "est = DML(\n", + "est = LinearDML(\n", " model_y=RandomForestRegressor(random_state=0),\n", " model_t=RandomForestClassifier(random_state=0),\n", - " model_final=StatsModelsLinearRegression(fit_intercept=False),\n", - " linear_first_stages=False,\n", " discrete_treatment=True,\n", " use_ray=True, #setting use_ray flag to True to use ray.\n", " ray_remote_func_options=ray_opts,\n", @@ -217,15 +215,13 @@ " runtimes = []\n", " for cv in cv_values:\n", " ray_opts = {'num_cpus': 2, 'scheduling_strategy': 'SPREAD'} if use_ray else None\n", - " 
est = DML(model_y=RandomForestRegressor(random_state=0),\n", - " model_t=RandomForestClassifier(random_state=0),\n", - " model_final=StatsModelsLinearRegression(fit_intercept=False),\n", - " linear_first_stages=False,\n", - " discrete_treatment=True,\n", - " use_ray=use_ray,\n", - " ray_remote_func_options=ray_opts,\n", - " cv=cv,\n", - " mc_iters=1)\n", + " est = LinearDML(model_y=RandomForestRegressor(random_state=0),\n", + " model_t=RandomForestClassifier(random_state=0),\n", + " discrete_treatment=True,\n", + " use_ray=use_ray,\n", + " ray_remote_func_options=ray_opts,\n", + " cv=cv,\n", + " mc_iters=1)\n", " \n", " start_time = time.time()\n", " est.fit(y, T, X=X, W=None)\n", @@ -296,9 +292,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file
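The updated tests above pass explicit first-stage learners (for example model_y_xw=LinearRegression() or WeightedLassoCVWrapper()) only to keep runtimes down; per the comments in test_driv.py and test_treatment_featurization.py, leaving those arguments at their defaults makes the estimator select among candidate first-stage models, including random forests. A minimal sketch of the two usage patterns with LinearDML, on illustrative data (the shapes mirror the notebook example above):

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from econml.dml import LinearDML

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
T = rng.binomial(1, 0.5, size=1000)
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + rng.normal(size=1000)

# Default first-stage arguments: nuisance models are chosen automatically by
# cross-validated selection, which needs no tuning but takes longer to fit.
est_auto = LinearDML(discrete_treatment=True)
est_auto.fit(y, T, X=X)

# Explicit first-stage models: bypasses the selection step entirely, which is
# what the updated tests do to stay fast.
est_fast = LinearDML(model_y=LinearRegression(),
                     model_t=LogisticRegression(),
                     discrete_treatment=True)
est_fast.fit(y, T, X=X)
print(est_fast.effect(X).mean())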
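The test_refit.py changes above inspect the re-fitted first-stage models through the models_y_xw / models_t_xw / models_z_xw accessors rather than reaching into models_nuisance_. A small self-contained sketch of that workflow for OrthoIV, assuming its default continuous-treatment and continuous-instrument settings (the data-generating code is illustrative):

import numpy as np
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from econml.iv.dml import OrthoIV

rng = np.random.default_rng(0)
n = 500
W = rng.normal(size=(n, 3))
Z = rng.normal(size=(n, 1))
T = 0.5 * Z[:, 0] + W[:, 0] + rng.normal(size=n)
y = 2.0 * T + W[:, 0] + rng.normal(size=n)

est = OrthoIV(model_y_xw=LinearRegression(),
              model_t_xw=LinearRegression(),
              model_z_xw=LinearRegression())
est.fit(y, T, Z=Z, W=W, cache_values=True)

# Swap in different first-stage models and fit again; the fitted per-fold
# instances are then exposed directly on the estimator.
est.model_y_xw = Lasso()
est.model_t_xw = ElasticNet()
est.fit(y, T, Z=Z, W=W, cache_values=True)
assert isinstance(est.models_y_xw[0][0], Lasso)
assert isinstance(est.models_t_xw[0][0], ElasticNet)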
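The diff above also removes fit_with_groups from econml/utilities.py; its docstring describes the workaround of materializing group-aware folds and assigning them to an estimator's cv attribute, since estimators like LassoCV accept no groups argument in fit (the test utilities now refer to a _fit_with_groups helper instead). A short sketch of that workaround using scikit-learn alone, with illustrative data:

import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, 0.0, -1.0]) + rng.normal(size=100)
groups = np.repeat(np.arange(20), 5)  # 20 groups of 5 rows each

model = LassoCV()
# LassoCV.fit takes no 'groups' argument, so GroupKFold cannot be used as cv
# directly; instead, precompute the group-respecting folds and hand them to
# the 'cv' attribute as explicit (train, test) index pairs.
model.cv = list(GroupKFold(n_splits=2).split(X, y, groups=groups))
model.fit(X, y)
print(model.alpha_)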