judithabk6 · judithabk6 · Jul 10, 2024 · Mar 28, 2024 · Mar 28, 2024 · Apr 4, 2024
diff --git a/.github/workflows/code-cov.yaml b/.github/workflows/code-cov.yaml
@@ -39,6 +39,8 @@ jobs:
         dependencies: 'NA'
         install-pandoc: false
         packages: |
+          [email protected]
+          [email protected]
           grf
           causalweight
           mediation
@@ -53,6 +55,7 @@ jobs:
 
     - name: Run tests with coverage
       run: |
+        export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH}
         pytest --cov=med_bench --cov-report=xml
 
     - name: Upload coverage to Codecov

diff --git a/.github/workflows/tests-with-R.yaml b/.github/workflows/tests-with-R.yaml
@@ -39,6 +39,8 @@ jobs:
         dependencies: 'NA'
         install-pandoc: false
         packages: |
+          [email protected]
+          [email protected]
           grf
           causalweight
           mediation
@@ -53,4 +55,5 @@ jobs:
 
     - name: Run tests
       run: |
-        pytest
+        export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH}
+        pytest
diff --git a/src/med_bench/mediation.py b/src/med_bench/mediation.py
@@ -18,7 +18,7 @@
                               _estimate_mediator_density,
                               _estimate_treatment_probabilities,
                               _get_classifier, _get_regressor)
-from .utils.utils import r_dependency_required
+from .utils.utils import r_dependency_required, _check_input
 
 ALPHAS = np.logspace(-5, 5, 8)
 CV_FOLDS = 5
@@ -90,6 +90,9 @@ def mediation_IPW(y, t, m, x, trim, regularization=True, forest=False,
     int
             number of used observations (non trimmed)
     """
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
+
     # estimate propensities
     classifier_t_x = _get_classifier(regularization, forest, calibration)
     classifier_t_xm = _get_classifier(regularization, forest, calibration)
@@ -179,12 +182,13 @@ def mediation_coefficient_product(y, t, m, x, interaction=False,
         alphas = ALPHAS
     else:
         alphas = [TINY]
-    if len(x.shape) == 1:
-        x = x.reshape(-1, 1)
-    if len(m.shape) == 1:
-        m = m.reshape(-1, 1)
+
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='multidimensional') 
+
     if len(t.shape) == 1:
         t = t.reshape(-1, 1)
+
     coef_t_m = np.zeros(m.shape[1])
     for i in range(m.shape[1]):
         m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\
@@ -248,17 +252,20 @@ def mediation_g_formula(y, t, m, x, interaction=False, forest=False,
     calibration : str, default=sigmoid
             calibration mode; for example using a sigmoid function
     """
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='binary')
+
     # estimate mediator densities
     classifier_m = _get_classifier(regularization, forest, calibration)
-    f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(t, m, x, y,
+    f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(y, t, m, x,
                                                                   crossfit,
                                                                   classifier_m,
                                                                   interaction)
 
     # estimate conditional mean outcomes
     regressor_y = _get_regressor(regularization, forest)
     mu_00x, mu_01x, mu_10x, mu_11x, _, _ = (
-        _estimate_conditional_mean_outcome(t, m, x, y, crossfit, regressor_y,
+        _estimate_conditional_mean_outcome(y, t, m, x, crossfit, regressor_y,
                                            interaction))
 
     # G computation
@@ -319,10 +326,9 @@ def alternative_estimator(y, t, m, x, regularization=True):
         alphas = ALPHAS
     else:
         alphas = [TINY]
-    if len(x.shape) == 1:
-        x = x.reshape(-1, 1)
-    if len(m.shape) == 1:
-        m = m.reshape(-1, 1)
+
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
     treated = (t == 1)
 
     # computation of direct effect
@@ -433,29 +439,9 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False,
         - If x, t, m, or y don't have the same length.
         - If m is not binary.
     """
-    # Format checking
-    if len(y) != len(y.ravel()):
-        raise ValueError("Multidimensional y is not supported")
-    if len(t) != len(t.ravel()):
-        raise ValueError("Multidimensional t is not supported")
-    if len(m) != len(m.ravel()):
-        raise ValueError("Multidimensional m is not supported")
-
-    n = len(y)
-    if len(x.shape) == 1:
-        x.reshape(n, 1)
-    if len(m.shape) == 1:
-        m.reshape(n, 1)
-
-    dim_m = m.shape[1]
-    if n * dim_m != sum(m.ravel() == 1) + sum(m.ravel() == 0):
-        raise ValueError("m is not binary")
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='binary')
 
-    y = y.ravel()
-    t = t.ravel()
-    m = m.ravel()
-    if n != len(x) or n != len(m) or n != len(t):
-        raise ValueError("Inputs don't have the same number of observations")
 
     # estimate propensities
     classifier_t_x = _get_classifier(regularization, forest, calibration)
@@ -466,15 +452,15 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False,
     # estimate mediator densities
     classifier_m = _get_classifier(regularization, forest, calibration)
     f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = (
-        _estimate_mediator_density(t, m, x, y, crossfit,
+        _estimate_mediator_density(y, t, m, x, crossfit,
                                    classifier_m, interaction))
     f = f_00x, f_01x, f_10x, f_11x
 
     # estimate conditional mean outcomes
     regressor_y = _get_regressor(regularization, forest)
     regressor_cross_y = _get_regressor(regularization, forest)
     mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = (
-        _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit,
+        _estimate_cross_conditional_mean_outcome(y, t, m, x, crossfit,
                                                  regressor_y,
                                                  regressor_cross_y, f,
                                                  interaction))
@@ -574,7 +560,10 @@ def r_mediate(y, t, m, x, interaction=False):
     Rstats = rpackages.importr('stats')
     base = rpackages.importr('base')
 
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='binary')
     m = m.ravel()
+
     var_names = [[y, 'y'],
                  [t, 't'],
                  [m, 'm'],
@@ -629,7 +618,10 @@ def r_mediation_g_estimator(y, t, m, x):
     plmed = rpackages.importr('plmed')
     base = rpackages.importr('base')
 
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='binary')
     m = m.ravel()
+
     var_names = [[y, 'y'],
                  [t, 't'],
                  [m, 'm'],
@@ -713,6 +705,9 @@ def r_mediation_dml(y, t, m, x, trim=0.05, order=1):
     causalweight = rpackages.importr('causalweight')
     base = rpackages.importr('base')
 
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
+
     x_r, t_r, m_r, y_r = [base.as_matrix(_convert_array_to_R(uu)) for uu in
                           (x, t, m, y)]
     res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order)
@@ -805,25 +800,9 @@ def mediation_dml(y, t, m, x, forest=False, crossfit=0, trim=0.05, clip=1e-6,
         - If t or y are multidimensional.
         - If x, t, m, or y don't have the same length.
     """
-    # check format
-    if len(y) != len(y.ravel()):
-        raise ValueError("Multidimensional y is not supported")
-
-    if len(t) != len(t.ravel()):
-        raise ValueError("Multidimensional t is not supported")
-
+    # check input
+    y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
     n = len(y)
-    t = t.ravel()
-    y = y.ravel()
-
-    if n != len(x) or n != len(m) or n != len(t):
-        raise ValueError("Inputs don't have the same number of observations")
-
-    if len(x.shape) == 1:
-        x.reshape(n, 1)
-
-    if len(m.shape) == 1:
-        m.reshape(n, 1)
 
     nobs = 0
 
@@ -850,7 +829,7 @@ def mediation_dml(y, t, m, x, forest=False, crossfit=0, trim=0.05, clip=1e-6,
     regressor_cross_y = _get_regressor(regularization, forest)
 
     mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = (
-        _estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit,
+        _estimate_cross_conditional_mean_outcome_nesting(y, t, m, x, crossfit,
                                                          regressor_y,
                                                          regressor_cross_y))
 

diff --git a/src/med_bench/utils/nuisances.py b/src/med_bench/utils/nuisances.py
@@ -119,10 +119,6 @@ def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm):
 
     p_x, p_xm = [np.zeros(n) for h in range(2)]
     # compute propensity scores
-    if len(x.shape) == 1:
-        x = x.reshape(-1, 1)
-    if len(m.shape) == 1:
-        m = m.reshape(-1, 1)
     if len(t.shape) == 1:
         t = t.reshape(-1, 1)
 
@@ -143,7 +139,7 @@ def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm):
     return p_x, p_xm
 
 
-def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
+def _estimate_mediator_density(y, t, m, x, crossfit, clf_m, interaction):
     """
     Estimate mediator density f(M|T,X)
     with train test lists from crossfitting
@@ -164,8 +160,6 @@ def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
         probabilities f(M|T=1,X)
     """
     n = len(y)
-    if len(x.shape) == 1:
-        x = x.reshape(-1, 1)
 
     if len(t.shape) == 1:
         t = t.reshape(-1, 1)
@@ -206,7 +200,7 @@ def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
     return f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x
 
 
-def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
+def _estimate_conditional_mean_outcome(y, t, m, x, crossfit, reg_y,
                                        interaction):
     """
     Estimate conditional mean outcome E[Y|T,M,X]
@@ -228,12 +222,7 @@ def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
         conditional mean outcome estimates E[Y|T=1,M,X]
     """
     n = len(y)
-    if len(x.shape) == 1:
-        x = x.reshape(-1, 1)
-    if len(m.shape) == 1:
-        mr = m.reshape(-1, 1)
-    else:
-        mr = np.copy(m)
+    mr = np.copy(m)
     if len(t.shape) == 1:
         t = t.reshape(-1, 1)
 
@@ -275,7 +264,7 @@ def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
     return mu_00x, mu_01x, mu_10x, mu_11x, mu_0mx, mu_1mx
 
 
-def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
+def _estimate_cross_conditional_mean_outcome(y, t, m, x, crossfit, reg_y,
                                              reg_cross_y, f, interaction):
     """
     Estimate the conditional mean outcome,
@@ -397,7 +386,7 @@ def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
     return mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1
 
 
-def _estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit,
+def _estimate_cross_conditional_mean_outcome_nesting(y, t, m, x, crossfit,
                                                      reg_y, reg_cross_y):
     """
     Estimate treatment probabilities and the conditional mean outcome,

diff --git a/src/med_bench/utils/utils.py b/src/med_bench/utils/utils.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 
+
 import subprocess
 import warnings
 
@@ -158,3 +159,82 @@ def _convert_array_to_R(x):
     elif len(x.shape) == 2:
         return robjects.r.matrix(robjects.FloatVector(x.ravel()),
                                  nrow=x.shape[0], byrow='TRUE')
+
+
+def _check_input(y, t, m, x, setting):
+    """
+    Internal function to check inputs. `_check_input` adjusts the dimension
+    of the inputs (matrices or vectors), and raises a ValueError
+    - if the size of an input is not adequate,
+    - or if the type of input is not supported (non-binary treatment, or
+    non-binary one-dimensional mediator if the specified setting parameter
+    is 'binary')
+
+    Parameters
+    ----------
+    y : array-like, shape (n_samples)
+        Outcome value for each unit, continuous
+
+    t : array-like, shape (n_samples)
+        Treatment value for each unit, binary
+
+    m : array-like, shape (n_samples, n_mediators)
+        Mediator value for each unit (restrictions depend on `setting`)
+
+    x : array-like, shape (n_samples, n_features_covariates)
+        Covariates value for each unit, continuous
+
+    setting : string
+        Type of mediator: 'binary', 'continuous' or 'multidimensional'
+
+    Returns
+    -------
+    y_converted : array-like, shape (n_samples,)
+        Outcome value for each unit, continuous
+
+    t_converted : array-like, shape (n_samples,)
+        Treatment value for each unit, binary
+
+    m_converted : array-like, shape (n_samples, n_mediators)
+        Mediator value for each unit, as a column matrix if m was 1-D
+
+    x_converted : array-like, shape (n_samples, n_features_covariates)
+        Covariates value for each unit, continuous
+    """
+    # check format
+    if len(y) != len(y.ravel()):
+        raise ValueError("Multidimensional y (outcome) is not supported")
+
+    if len(t) != len(t.ravel()):
+        raise ValueError("Multidimensional t (exposure) is not supported")
+
+    if len(np.unique(t)) != 2:
+        raise ValueError("Only a binary t (exposure) is supported")
+
+    n = len(y)
+    t_converted = t.ravel()
+    y_converted = y.ravel()
+
+    if n != len(x) or n != len(m) or n != len(t):
+        raise ValueError("Inputs don't have the same number of observations")
+
+    if len(x.shape) == 1:
+        x_converted = x.reshape(n, 1)
+    else:
+        x_converted = x
+
+    if len(m.shape) == 1:
+        m_converted = m.reshape(n, 1)
+    else:
+        m_converted = m
+
+    if (m_converted.shape[1] >1) and (setting != 'multidimensional'):
+        raise ValueError("Multidimensional m (mediator) is not supported")
+
+    if (setting == 'binary') and (len(np.unique(m)) != 2):
+        raise ValueError(
+            "Only a binary one-dimensional m (mediator) is supported")
+
+    return y_converted, t_converted, m_converted, x_converted
+
+