Skip to content

Commit

Permalink
Allow GLM models to output class for binary outcome for do-operation (#386)

Browse files Browse the repository at this point in the history

* changes to regression

* added option to output score or class in glm

* updated fn reference
  • Loading branch information
amit-sharma authored Mar 6, 2022
1 parent 9e21a67 commit 28d25a0
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 10 deletions.
21 changes: 18 additions & 3 deletions dowhy/causal_estimators/generalized_linear_model_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,38 @@ class GeneralizedLinearModelEstimator(RegressionEstimator):
"""Compute effect of treatment using a generalized linear model such as logistic regression.
Implementation uses statsmodels.api.GLM.
Needs an additional parameter, "glm_family" to be specified in method_params. The value of this parameter can be any valid statsmodels.api families object. For example, to use logistic regression, specify "glm_family" as statsmodels.api.families.Binomial().
Needs an additional parameter, "glm_family" to be specified in method_params. The value of this parameter can be any valid statsmodels.api families object. For example, to use logistic regression, specify "glm_family" as statsmodels.api.families.Binomial().
"""

def __init__(self, *args, **kwargs):
    """Set up the generalized linear model estimator.

    Requires a 'glm_family' entry in method_params (any valid
    statsmodels.api families object, e.g.
    statsmodels.api.families.Binomial() for logistic regression).
    An optional 'predict_score' entry (default True) controls whether
    predictions for a binary outcome are raw scores or 0/1 classes.
    """
    super().__init__(*args, **kwargs)
    self.logger.info("INFO: Using Generalized Linear Model Estimator")
    params = self.method_params
    if params is not None and 'glm_family' in params:
        self.family = params['glm_family']
    else:
        raise ValueError("Need to specify the family for the generalized linear model. Provide a 'glm_family' parameter in method_params, such as statsmodels.api.families.Binomial() for logistic regression.")
    # Whether predict_fn should return raw scores (True) or thresholded
    # 0/1 classes (False) when the outcome is binary.
    self.predict_score = True
    if params is not None and 'predict_score' in params:
        self.predict_score = params['predict_score']
    # The outcome counts as binary when every observed value is 0 or 1.
    observed_outcomes = self._data[self._outcome_name].astype(int).unique()
    self.outcome_is_binary = all(v in (0, 1) for v in observed_outcomes)

def _build_model(self):
    """Fit a statsmodels GLM of the outcome on the built feature matrix.

    Returns:
        tuple: (features, fitted_model)
    """
    design = self._build_features()
    fitted = sm.GLM(self._outcome, design, family=self.family).fit()
    return (design, fitted)

def predict_fn(self, model, features):
    """Predict outcomes for the given feature matrix.

    For a binary outcome with self.predict_score False, the raw score
    is thresholded at 0.5 and returned as a 0/1 class; in every other
    case the model's raw prediction is returned unchanged.
    """
    scores = model.predict(features)
    if self.outcome_is_binary and not self.predict_score:
        # assumes model.predict returns a numpy-like array — TODO confirm
        return (scores > 0.5).astype(int)
    return scores

def construct_symbolic_estimator(self, estimand):
expr = "b: " + ",".join(estimand.outcome_variable) + "~" + "Sigmoid("
var_list = estimand.treatment_variable + estimand.get_backdoor_variables()
Expand Down
3 changes: 3 additions & 0 deletions dowhy/causal_estimators/linear_regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def construct_symbolic_estimator(self, estimand):
expr += "+" + "+".join(interaction_terms)
return expr

def predict_fn(self, model, features):
    """Return the fitted model's predictions for the given feature matrix."""
    predictions = model.predict(features)
    return predictions

def _build_model(self):
    """Fit an OLS regression of the outcome on the built feature matrix.

    Returns:
        tuple: (features, fitted_model)
    """
    design = self._build_features()
    return (design, sm.OLS(self._outcome, design).fit())
Expand Down
2 changes: 1 addition & 1 deletion dowhy/causal_estimators/regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,6 @@ def _do(self, treatment_val, data_df=None):

new_features = self._build_features(treatment_values=interventional_treatment_2d,
data_df=data_df)
interventional_outcomes = self.model.predict(new_features)
interventional_outcomes = self.predict_fn(self.model, new_features)
return interventional_outcomes.mean()

14 changes: 9 additions & 5 deletions dowhy/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
def sigmoid(x):
    """Logistic function: map a real number into the open interval (0, 1)."""
    return 1.0 / (1.0 + math.exp(-x))

def convert_to_binary(x, stochastic=True):
    """Discretize a real value to {0, 1} via the sigmoid.

    With stochastic=True, draws 1 with probability sigmoid(x)
    (returned as a length-1 array from numpy's choice); otherwise
    deterministically thresholds sigmoid(x) at 0.5 and returns an int.
    """
    prob_one = sigmoid(x)
    if not stochastic:
        return int(prob_one > 0.5)
    return choice([0, 1], 1, p=[1 - prob_one, prob_one])

def stochastically_convert_to_three_level_categorical(x):
p = sigmoid(x)
Expand Down Expand Up @@ -57,6 +60,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
treatment_is_binary=True,
treatment_is_category=False,
outcome_is_binary=False,
stochastic_discretization=True,
num_discrete_common_causes=0,
num_discrete_instruments=0,
num_discrete_effect_modifiers=0,
Expand Down Expand Up @@ -113,7 +117,7 @@ def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
t += Z @ cz
# Converting treatment to binary if required
if treatment_is_binary:
t = np.vectorize(stochastically_convert_to_binary)(t)
t = np.vectorize(convert_to_binary)(t)
elif treatment_is_category:
t = np.vectorize(stochastically_convert_to_three_level_categorical)(t)

Expand Down Expand Up @@ -148,7 +152,7 @@ def _compute_y(t, W, X, FD, beta, c2, ce, cfd2):
if num_effect_modifiers > 0:
y += (X @ ce) * np.prod(t, axis=1)
if outcome_is_binary:
y = np.vectorize(stochastically_convert_to_binary)(y)
y = np.vectorize(convert_to_binary)(y,stochastic_discretization)
return y

y = _compute_y(t, W_with_dummy, X_with_categorical, FD, beta, c2, ce, cfd2)
Expand Down Expand Up @@ -242,7 +246,7 @@ def simple_iv_dataset(beta, num_samples,
Z = np.random.normal(0, 1, (num_samples, num_instruments))
t = np.random.normal(0, 1, (num_samples, num_treatments)) + Z @ cz + W @ c1
if treatment_is_binary:
t = np.vectorize(stochastically_convert_to_binary)(t)
t = np.vectorize(convert_to_binary)(t)

def _compute_y(t, W, beta, c2):
y = t @ beta + W @ c2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def test_average_treatment_effect(self, error_tolerance, Estimator,
method_params={
'num_ci_simulations': 10,
'num_null_simulations': 10,
'glm_family': sm.families.Binomial()
'glm_family': sm.families.Binomial(),
'predict_score': True
}
)

0 comments on commit 28d25a0

Please sign in to comment.