Skip to content

Commit

Permalink
Allow GLM models to output class for binary outcome for do-operation (#386)

Browse files Browse the repository at this point in the history

* changes to regression

* added option to output score or class in glm

* updated fn reference
  • Loading branch information
amit-sharma authored Mar 6, 2022
1 parent 9e21a67 commit 28d25a0
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 10 deletions.
21 changes: 18 additions & 3 deletions dowhy/causal_estimators/generalized_linear_model_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,38 @@ class GeneralizedLinearModelEstimator(RegressionEstimator):
"""Compute effect of treatment using a generalized linear model such as logistic regression.
Implementation uses statsmodels.api.GLM.
Needs an additional parameter, "glm_family" to be specified in method_params. The value of this parameter can be any valid statsmodels.api families object. For example, to use logistic regression, specify "glm_family" as statsmodels.api.families.Binomial().
Needs an additional parameter, "glm_family" to be specified in method_params. The value of this parameter can be any valid statsmodels.api families object. For example, to use logistic regression, specify "glm_family" as statsmodels.api.families.Binomial().
"""

def __init__(self, *args, **kwargs):
    """Set up the generalized linear model estimator.

    Requires a 'glm_family' entry in method_params (any valid
    statsmodels.api families object, e.g.
    statsmodels.api.families.Binomial() for logistic regression).
    An optional 'predict_score' entry (default True) controls whether
    predictions for a binary outcome are raw scores or 0/1 classes.
    """
    super().__init__(*args, **kwargs)
    self.logger.info("INFO: Using Generalized Linear Model Estimator")
    params = self.method_params
    if params is not None and 'glm_family' in params:
        self.family = params['glm_family']
    else:
        raise ValueError("Need to specify the family for the generalized linear model. Provide a 'glm_family' parameter in method_params, such as statsmodels.api.families.Binomial() for logistic regression.")
    # Whether predict_fn should return raw scores (True) or thresholded
    # 0/1 classes (False) when the outcome is binary.
    self.predict_score = True
    if params is not None and 'predict_score' in params:
        self.predict_score = params['predict_score']
    # The outcome counts as binary when every observed value is 0 or 1.
    observed_outcomes = self._data[self._outcome_name].astype(int).unique()
    self.outcome_is_binary = all(v in (0, 1) for v in observed_outcomes)

def _build_model(self):
    """Fit a statsmodels GLM of the outcome on the built feature matrix.

    Returns:
        tuple: (features, fitted_model)
    """
    design = self._build_features()
    fitted = sm.GLM(self._outcome, design, family=self.family).fit()
    return (design, fitted)

def predict_fn(self, model, features):
    """Predict outcomes for the given feature matrix.

    For a binary outcome with self.predict_score False, the raw score
    is thresholded at 0.5 and returned as a 0/1 class; in every other
    case the model's raw prediction is returned unchanged.
    """
    scores = model.predict(features)
    if self.outcome_is_binary and not self.predict_score:
        # assumes model.predict returns a numpy-like array — TODO confirm
        return (scores > 0.5).astype(int)
    return scores

def construct_symbolic_estimator(self, estimand):
expr = "b: " + ",".join(estimand.outcome_variable) + "~" + "Sigmoid("
var_list = estimand.treatment_variable + estimand.get_backdoor_variables()
Expand Down
3 changes: 3 additions & 0 deletions dowhy/causal_estimators/linear_regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def construct_symbolic_estimator(self, estimand):
expr += "+" + "+".join(interaction_terms)
return expr

def predict_fn(self, model, features):
    """Return the fitted model's predictions for the given feature matrix."""
    predictions = model.predict(features)
    return predictions

def _build_model(self):
    """Fit an OLS regression of the outcome on the built feature matrix.

    Returns:
        tuple: (features, fitted_model)
    """
    design = self._build_features()
    return (design, sm.OLS(self._outcome, design).fit())
Expand Down
2 changes: 1 addition & 1 deletion dowhy/causal_estimators/regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,6 @@ def _do(self, treatment_val, data_df=None):

new_features = self._build_features(treatment_values=interventional_treatment_2d,
data_df=data_df)
interventional_outcomes = self.model.predict(new_features)
interventional_outcomes = self.predict_fn(self.model, new_features)
return interventional_outcomes.mean()

14 changes: 9 additions & 5 deletions dowhy/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
def sigmoid(x):
    """Logistic function: map a real number into the open interval (0, 1)."""
    return 1.0 / (1.0 + math.exp(-x))

def convert_to_binary(x, stochastic=True):
    """Discretize a real value to {0, 1} via the sigmoid.

    With stochastic=True, draws 1 with probability sigmoid(x)
    (returned as a length-1 array from numpy's choice); otherwise
    deterministically thresholds sigmoid(x) at 0.5 and returns an int.
    """
    prob_one = sigmoid(x)
    if not stochastic:
        return int(prob_one > 0.5)
    return choice([0, 1], 1, p=[1 - prob_one, prob_one])

def stochastically_convert_to_three_level_categorical(x):
p = sigmoid(x)
Expand Down Expand Up @@ -57,6 +60,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
treatment_is_binary=True,
treatment_is_category=False,
outcome_is_binary=False,
stochastic_discretization=True,
num_discrete_common_causes=0,
num_discrete_instruments=0,
num_discrete_effect_modifiers=0,
Expand Down Expand Up @@ -113,7 +117,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
t += Z @ cz
# Converting treatment to binary if required
if treatment_is_binary:
t = np.vectorize(stochastically_convert_to_binary)(t)
t = np.vectorize(convert_to_binary)(t)
elif treatment_is_category:
t = np.vectorize(stochastically_convert_to_three_level_categorical)(t)

Expand Down Expand Up @@ -148,7 +152,7 @@ def _compute_y(t, W, X, FD, beta, c2, ce, cfd2):
if num_effect_modifiers > 0:
y += (X @ ce) * np.prod(t, axis=1)
if outcome_is_binary:
y = np.vectorize(stochastically_convert_to_binary)(y)
y = np.vectorize(convert_to_binary)(y,stochastic_discretization)
return y

y = _compute_y(t, W_with_dummy, X_with_categorical, FD, beta, c2, ce, cfd2)
Expand Down Expand Up @@ -242,7 +246,7 @@ def simple_iv_dataset(beta, num_samples,
Z = np.random.normal(0, 1, (num_samples, num_instruments))
t = np.random.normal(0, 1, (num_samples, num_treatments)) + Z @ cz + W @ c1
if treatment_is_binary:
t = np.vectorize(stochastically_convert_to_binary)(t)
t = np.vectorize(convert_to_binary)(t)

def _compute_y(t, W, beta, c2):
y = t @ beta + W @ c2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def test_average_treatment_effect(self, error_tolerance, Estimator,
method_params={
'num_ci_simulations': 10,
'num_null_simulations': 10,
'glm_family': sm.families.Binomial()
'glm_family': sm.families.Binomial(),
'predict_score': True
}
)

0 comments on commit 28d25a0

Please sign in to comment.