Skip to content

Commit

Permalink
Final commit, added encoding for categorical data (untested) and adde…
Browse files Browse the repository at this point in the history
…d notebook to showcase some of the functionality

Signed-off-by: AnthonyCampbell208 <[email protected]>
  • Loading branch information
AnthonyCampbell208 authored and kbattocchi committed Aug 18, 2023
1 parent 4971259 commit a96601a
Show file tree
Hide file tree
Showing 4 changed files with 1,090 additions and 25 deletions.
14 changes: 8 additions & 6 deletions econml/dml/dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ def __init__(self, *,
model_y, model_t, model_final,
param_list_y=None,
param_list_t=None,
scoring_y=None,
scoring_t=None,
scaling=False,
featurizer=None,
treatment_featurizer=None,
Expand All @@ -493,6 +495,8 @@ def __init__(self, *,
self.scaling = scaling
self.param_list_y = param_list_y
self.param_list_t = param_list_t
self.scoring_y = scoring_y
self.scoring_t = scoring_t
self.verbose = verbose
self.cv = cv
self.grid_folds = grid_folds
Expand All @@ -514,10 +518,10 @@ def _gen_featurizer(self):

def _gen_model_y(self): # New
if self.model_y == 'auto':
model_y = SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y,
model_y = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_y, scoring=self.scoring_y,
scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state)
else:
model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y,
model_y = clone(SearchEstimatorList(estimator_list=self.model_y, param_grid_list=self.param_list_y, scoring=self.scoring_y,
scaling=self.scaling, verbose=self.verbose, cv=self.cv, n_jobs=self.n_jobs, random_state=self.random_state), safe=False)
# if self.model_y == 'auto':
# model_y = WeightedLassoCVWrapper(random_state=self.random_state)
Expand All @@ -527,15 +531,13 @@ def _gen_model_y(self): # New
self.linear_first_stages, self.discrete_treatment)

def _gen_model_t(self): # New
# import pdb
# pdb.set_trace()
if self.model_t == 'auto':
if self.discrete_treatment:
model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t,
model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t, scoring=self.scoring_t,
scaling=self.scaling, verbose=self.verbose, cv=WeightedStratifiedKFold(random_state=self.random_state), is_discrete=self.discrete_treatment,
n_jobs=self.n_jobs, random_state=self.random_state)
else:
model_t = SearchEstimatorList(estimator_list=self.model_t, param_grid_list=self.param_list_t,
model_t = SearchEstimatorList(estimator_list=WeightedLassoCVWrapper(random_state=self.random_state), param_grid_list=self.param_list_t, scoring=self.scoring_t,
scaling=self.scaling, verbose=self.verbose, cv=self.cv, is_discrete=self.discrete_treatment,
n_jobs=self.n_jobs, random_state=self.random_state)

Expand Down
23 changes: 11 additions & 12 deletions econml/sklearn_extensions/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ def __init__(self, estimator_list=['linear', 'forest'], param_grid_list=None, sc
self.error_score = error_score
self.return_train_score = return_train_score
self.is_discrete = is_discrete
self.supported_models = ['linear', 'forest', 'gbf', 'nnet', 'poly']

def fit(self, X, y, *, sample_weight=None, groups=None):
# print(groups)
Expand Down Expand Up @@ -400,6 +401,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
self.best_params_ = {}
return self
for estimator, param_grid in zip(self.complete_estimator_list, self.param_grid_list):
if self.verbose:
if is_polynomial_pipeline(estimator):
print(f"Processing estimator: {type(estimator.named_steps['linear']).__name__}")
else:
print(f"Processing estimator: {type(estimator).__name__}")
try:
if self.random_state != None:
if has_random_state(model=estimator):
Expand All @@ -408,8 +414,6 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
estimator = estimator.set_params(linear__random_state=self.random_state)
else:
estimator.set_params(random_state=self.random_state)
print(estimator) # Note Delete this
print(param_grid) # Note Delete this
# pdb.set_trace() # Note Delete this
temp_search = GridSearchCV(estimator, param_grid, scoring=self.scoring,
n_jobs=self.n_jobs, refit=self.refit, cv=self.cv, verbose=self.verbose,
Expand Down Expand Up @@ -441,8 +445,11 @@ def fit(self, X, y, *, sample_weight=None, groups=None):
# fit can return without raising an exception and still leave the search without cv_results_; that case indicates a failed fit operation, so warn about it here.
warning_msg = f"Warning: estimator {estimator} and param_grid {param_grid} failed has no attribute cv_results_."
warnings.warn(warning_msg, category=FitFailedWarning)

self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
try:
self.best_ind_ = np.argmax([search.best_score_ for search in self._search_list])
except Exception as e:
warning_msg = f"Failed for estimator {estimator} and param_grid {param_grid} with this error {e}."
raise Exception(warning_msg) from e
self.best_estimator_ = self._search_list[self.best_ind_].best_estimator_
self.best_score_ = self._search_list[self.best_ind_].best_score_
self.best_params_ = self._search_list[self.best_ind_].best_params_
Expand All @@ -465,14 +472,6 @@ def predict(self, X):
def predict_proba(self, X):
return self.best_estimator_.predict_proba(X)

def refit(self, X, y):
# Refits the best estimator using the entire dataset.
if self.best_estimator_ is None:
raise ValueError("No best estimator found. Please call the 'fit' method before calling 'refit'.")

self.best_estimator_.fit(X, y)
return self


class GridSearchCVList(BaseEstimator):
""" An extension of GridSearchCV that allows for passing a list of estimators each with their own
Expand Down
47 changes: 40 additions & 7 deletions econml/sklearn_extensions/model_selection_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.model_selection import KFold
# from sklearn_extensions.model_selection import WeightedStratifiedKFold
import pandas as pd


def select_continuous_estimator(estimator_type, random_state):
Expand Down Expand Up @@ -57,6 +57,9 @@ def select_continuous_estimator(estimator_type, random_state):
poly = PolynomialFeatures()
linear = ElasticNetCV(random_state=random_state) # Play around with precompute and tolerance
return (Pipeline([('poly', poly), ('linear', linear)]))
elif estimator_type == 'weighted_lasso':
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper
return WeightedLassoCVWrapper(random_state=random_state)
else:
raise ValueError(f"Unsupported estimator type: {estimator_type}")

Expand Down Expand Up @@ -278,18 +281,15 @@ def select_classification_hyperparameters(estimator):
elif isinstance(estimator, MLPClassifier):
return {
'hidden_layer_sizes': [(10,), (50,), (100,)],
'activation': ['relu'],
'solver': ['adam'],
'alpha': [0.0001, 0.001, 0.01],
'alpha': [0.0001, 0.01],
'learning_rate': ['constant', 'adaptive']
}
elif is_polynomial_pipeline(estimator=estimator):
return {
'poly__degree': [2, 3, 4],
'linear__Cs': [1, 10, 20],
'linear__max_iter': [100, 200],
'linear__penalty': ['l2'],
'linear__solver': ['saga', 'liblinear', 'lbfgs']
'linear__solver': ['saga', 'lbfgs']
}
else:
warnings.warn("No hyperparameters for this type of model. There are default hyperparameters for LogisticRegressionCV, RandomForestClassifier, MLPClassifier, and the polynomial pipleine", category=UserWarning)
Expand Down Expand Up @@ -324,7 +324,7 @@ def select_regression_hyperparameters(estimator):
elif isinstance(estimator, MLPRegressor):
return {
'hidden_layer_sizes': [(10,), (50,), (100,)],
'alpha': [0.0001, 0.001, 0.01],
'alpha': [0.0001, 0.01],
'learning_rate': ['constant', 'adaptive']
}
elif isinstance(estimator, GradientBoostingRegressor):
Expand Down Expand Up @@ -775,3 +775,36 @@ def make_param_multi_task(estimator, param_grid):
else:
param_grid_multi = {f'estimator__{k}': v for k, v in param_grid.items()}
return param_grid_multi


def preprocess_and_encode(data, cat_indices=None):
    """
    One-hot encode the categorical columns of a DataFrame or numpy array.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Input table. A numpy array is wrapped in a DataFrame (with the default
        integer column labels) so that it can be encoded.
    cat_indices : list-like, optional
        Columns to treat as categorical. When every entry is an integer
        (Python ``int`` or a numpy integer) the entries are interpreted as
        column positions; otherwise they are used as column labels. When
        None, columns with ``object`` or ``category`` dtype are encoded.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The encoded data, in the same container type as the input. Columns
        that are not encoded are kept; each encoded column is replaced by
        its dummy columns.
    """
    # Local import keeps this function self-contained; `numbers.Integral`
    # matches numpy integer scalars as well as built-in ints.
    import numbers

    was_numpy = isinstance(data, np.ndarray)
    if was_numpy:
        data = pd.DataFrame(data)

    if cat_indices is None:
        # Heuristic: object and category dtypes are assumed to be categorical.
        cat_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
    elif all(isinstance(i, numbers.Integral) for i in cat_indices):
        # Integer entries are positions; translate them to column labels.
        cat_columns = data.columns[list(cat_indices)].tolist()
    else:
        # Anything else is assumed to already be column labels.
        cat_columns = cat_indices

    data_encoded = pd.get_dummies(data, columns=cat_columns)

    # Hand back the same container type the caller supplied.
    return data_encoded.values if was_numpy else data_encoded
Loading

0 comments on commit a96601a

Please sign in to comment.