Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Input check #81

Merged
merged 20 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/code-cov.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ jobs:
dependencies: 'NA'
install-pandoc: false
packages: |
[email protected]
[email protected]
grf
causalweight
mediation
Expand All @@ -53,6 +55,7 @@ jobs:

- name: Run tests with coverage
run: |
export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH}
pytest --cov=med_bench --cov-report=xml

- name: Upload coverage to Codecov
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/tests-with-R.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ jobs:
dependencies: 'NA'
install-pandoc: false
packages: |
[email protected]
[email protected]
grf
causalweight
mediation
Expand All @@ -53,4 +55,5 @@ jobs:

- name: Run tests
run: |
pytest
export LD_LIBRARY_PATH=$(python -m rpy2.situation LD_LIBRARY_PATH):${LD_LIBRARY_PATH}
pytest
87 changes: 33 additions & 54 deletions src/med_bench/mediation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_estimate_mediator_density,
_estimate_treatment_probabilities,
_get_classifier, _get_regressor)
from .utils.utils import r_dependency_required
from .utils.utils import r_dependency_required, _check_input

ALPHAS = np.logspace(-5, 5, 8)
CV_FOLDS = 5
Expand Down Expand Up @@ -90,6 +90,9 @@ def mediation_IPW(y, t, m, x, trim, regularization=True, forest=False,
int
number of used observations (non trimmed)
"""
# check input
y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')

# estimate propensities
classifier_t_x = _get_classifier(regularization, forest, calibration)
classifier_t_xm = _get_classifier(regularization, forest, calibration)
Expand Down Expand Up @@ -179,12 +182,13 @@ def mediation_coefficient_product(y, t, m, x, interaction=False,
alphas = ALPHAS
else:
alphas = [TINY]
if len(x.shape) == 1:
x = x.reshape(-1, 1)
if len(m.shape) == 1:
m = m.reshape(-1, 1)

# check input
y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')

if len(t.shape) == 1:
t = t.reshape(-1, 1)

coef_t_m = np.zeros(m.shape[1])
for i in range(m.shape[1]):
m_reg = RidgeCV(alphas=alphas, cv=CV_FOLDS)\
Expand Down Expand Up @@ -248,17 +252,20 @@ def mediation_g_formula(y, t, m, x, interaction=False, forest=False,
calibration : str, default=sigmoid
calibration mode; for example using a sigmoid function
"""
# check input
y, t, m, x = _check_input(y, t, m, x, setting='binary')

# estimate mediator densities
classifier_m = _get_classifier(regularization, forest, calibration)
f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(t, m, x, y,
f_00x, f_01x, f_10x, f_11x, _, _ = _estimate_mediator_density(y, t, m, x,
crossfit,
classifier_m,
interaction)

# estimate conditional mean outcomes
regressor_y = _get_regressor(regularization, forest)
mu_00x, mu_01x, mu_10x, mu_11x, _, _ = (
_estimate_conditional_mean_outcome(t, m, x, y, crossfit, regressor_y,
_estimate_conditional_mean_outcome(y, t, m, x, crossfit, regressor_y,
interaction))

# G computation
Expand Down Expand Up @@ -319,10 +326,9 @@ def alternative_estimator(y, t, m, x, regularization=True):
alphas = ALPHAS
else:
alphas = [TINY]
if len(x.shape) == 1:
x = x.reshape(-1, 1)
if len(m.shape) == 1:
m = m.reshape(-1, 1)

# check input
y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
treated = (t == 1)

# computation of direct effect
Expand Down Expand Up @@ -433,29 +439,9 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False,
- If x, t, m, or y don't have the same length.
- If m is not binary.
"""
# Format checking
if len(y) != len(y.ravel()):
raise ValueError("Multidimensional y is not supported")
if len(t) != len(t.ravel()):
raise ValueError("Multidimensional t is not supported")
if len(m) != len(m.ravel()):
raise ValueError("Multidimensional m is not supported")

n = len(y)
if len(x.shape) == 1:
x.reshape(n, 1)
if len(m.shape) == 1:
m.reshape(n, 1)

dim_m = m.shape[1]
if n * dim_m != sum(m.ravel() == 1) + sum(m.ravel() == 0):
raise ValueError("m is not binary")
# check input
y, t, m, x = _check_input(y, t, m, x, setting='binary')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make sure that the argument order y, t, m, x is there in all functions. Sometimes, there is t, m, x, y


y = y.ravel()
t = t.ravel()
m = m.ravel()
if n != len(x) or n != len(m) or n != len(t):
raise ValueError("Inputs don't have the same number of observations")

# estimate propensities
classifier_t_x = _get_classifier(regularization, forest, calibration)
Expand All @@ -466,15 +452,15 @@ def mediation_multiply_robust(y, t, m, x, interaction=False, forest=False,
# estimate mediator densities
classifier_m = _get_classifier(regularization, forest, calibration)
f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x = (
_estimate_mediator_density(t, m, x, y, crossfit,
_estimate_mediator_density(y, t, m, x, crossfit,
classifier_m, interaction))
f = f_00x, f_01x, f_10x, f_11x

# estimate conditional mean outcomes
regressor_y = _get_regressor(regularization, forest)
regressor_cross_y = _get_regressor(regularization, forest)
mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = (
_estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit,
_estimate_cross_conditional_mean_outcome(y, t, m, x, crossfit,
regressor_y,
regressor_cross_y, f,
interaction))
Expand Down Expand Up @@ -574,7 +560,10 @@ def r_mediate(y, t, m, x, interaction=False):
Rstats = rpackages.importr('stats')
base = rpackages.importr('base')

# check input
y, t, m, x = _check_input(y, t, m, x, setting='binary')
m = m.ravel()

var_names = [[y, 'y'],
[t, 't'],
[m, 'm'],
Expand Down Expand Up @@ -629,7 +618,10 @@ def r_mediation_g_estimator(y, t, m, x):
plmed = rpackages.importr('plmed')
base = rpackages.importr('base')

# check input
y, t, m, x = _check_input(y, t, m, x, setting='binary')
m = m.ravel()

var_names = [[y, 'y'],
[t, 't'],
[m, 'm'],
Expand Down Expand Up @@ -713,6 +705,9 @@ def r_mediation_dml(y, t, m, x, trim=0.05, order=1):
causalweight = rpackages.importr('causalweight')
base = rpackages.importr('base')

# check input
y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')

x_r, t_r, m_r, y_r = [base.as_matrix(_convert_array_to_R(uu)) for uu in
(x, t, m, y)]
res = causalweight.medDML(y_r, t_r, m_r, x_r, trim=trim, order=order)
Expand Down Expand Up @@ -805,25 +800,9 @@ def mediation_dml(y, t, m, x, forest=False, crossfit=0, trim=0.05, clip=1e-6,
- If t or y are multidimensional.
- If x, t, m, or y don't have the same length.
"""
# check format
if len(y) != len(y.ravel()):
raise ValueError("Multidimensional y is not supported")

if len(t) != len(t.ravel()):
raise ValueError("Multidimensional t is not supported")

# check input
y, t, m, x = _check_input(y, t, m, x, setting='multidimensional')
n = len(y)
t = t.ravel()
y = y.ravel()

if n != len(x) or n != len(m) or n != len(t):
raise ValueError("Inputs don't have the same number of observations")

if len(x.shape) == 1:
x.reshape(n, 1)

if len(m.shape) == 1:
m.reshape(n, 1)

nobs = 0

Expand All @@ -850,7 +829,7 @@ def mediation_dml(y, t, m, x, forest=False, crossfit=0, trim=0.05, clip=1e-6,
regressor_cross_y = _get_regressor(regularization, forest)

mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1 = (
_estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit,
_estimate_cross_conditional_mean_outcome_nesting(y, t, m, x, crossfit,
regressor_y,
regressor_cross_y))

Expand Down
21 changes: 5 additions & 16 deletions src/med_bench/utils/nuisances.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,6 @@ def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm):

p_x, p_xm = [np.zeros(n) for h in range(2)]
# compute propensity scores
if len(x.shape) == 1:
x = x.reshape(-1, 1)
if len(m.shape) == 1:
m = m.reshape(-1, 1)
if len(t.shape) == 1:
t = t.reshape(-1, 1)

Expand All @@ -143,7 +139,7 @@ def _estimate_treatment_probabilities(t, m, x, crossfit, clf_t_x, clf_t_xm):
return p_x, p_xm


def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
def _estimate_mediator_density(y, t, m, x, crossfit, clf_m, interaction):
"""
Estimate mediator density f(M|T,X)
with train test lists from crossfitting
Expand All @@ -164,8 +160,6 @@ def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
probabilities f(M|T=1,X)
"""
n = len(y)
if len(x.shape) == 1:
x = x.reshape(-1, 1)

if len(t.shape) == 1:
t = t.reshape(-1, 1)
Expand Down Expand Up @@ -206,7 +200,7 @@ def _estimate_mediator_density(t, m, x, y, crossfit, clf_m, interaction):
return f_00x, f_01x, f_10x, f_11x, f_m0x, f_m1x


def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
def _estimate_conditional_mean_outcome(y, t, m, x, crossfit, reg_y,
interaction):
"""
Estimate conditional mean outcome E[Y|T,M,X]
Expand All @@ -228,12 +222,7 @@ def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
conditional mean outcome estimates E[Y|T=1,M,X]
"""
n = len(y)
if len(x.shape) == 1:
x = x.reshape(-1, 1)
if len(m.shape) == 1:
mr = m.reshape(-1, 1)
else:
mr = np.copy(m)
mr = np.copy(m)
if len(t.shape) == 1:
t = t.reshape(-1, 1)

Expand Down Expand Up @@ -275,7 +264,7 @@ def _estimate_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
return mu_00x, mu_01x, mu_10x, mu_11x, mu_0mx, mu_1mx


def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
def _estimate_cross_conditional_mean_outcome(y, t, m, x, crossfit, reg_y,
reg_cross_y, f, interaction):
"""
Estimate the conditional mean outcome,
Expand Down Expand Up @@ -397,7 +386,7 @@ def _estimate_cross_conditional_mean_outcome(t, m, x, y, crossfit, reg_y,
return mu_0mx, mu_1mx, E_mu_t0_t0, E_mu_t0_t1, E_mu_t1_t0, E_mu_t1_t1


def _estimate_cross_conditional_mean_outcome_nesting(t, m, x, y, crossfit,
def _estimate_cross_conditional_mean_outcome_nesting(y, t, m, x, crossfit,
reg_y, reg_cross_y):
"""
Estimate treatment probabilities and the conditional mean outcome,
Expand Down
80 changes: 80 additions & 0 deletions src/med_bench/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd


import subprocess
import warnings

Expand Down Expand Up @@ -158,3 +159,82 @@ def _convert_array_to_R(x):
elif len(x.shape) == 2:
return robjects.r.matrix(robjects.FloatVector(x.ravel()),
nrow=x.shape[0], byrow='TRUE')


def _check_input(y, t, m, x, setting):
"""
internal function to check inputs. `_check_input` adjusts the dimension
of the input (matrix or vectors), and raises an error
- if the size of input is not adequate,
- or if the type of input is not supported (cotinuous treatment or
non-binary one-dimensional mediator if the specified setting parameter
is binary)

Parameters
----------
y : array-like, shape (n_samples)
Outcome value for each unit, continuous

t : array-like, shape (n_samples)
Treatment value for each unit, binary

m : array-like, shape (n_samples, n_mediators)
Mediator value for each unit, binary and unidimensional

x : array-like, shape (n_samples, n_features_covariates)
Covariates value for each unit, continuous

setting : string
('binary', 'continuous', 'multidimensional') value for the mediator

Returns
-------
y_converted : array-like, shape (n_samples,)
Outcome value for each unit, continuous

t_converted : array-like, shape (n_samples,)
Treatment value for each unit, binary

m_converted : array-like, shape (n_samples, n_mediators)
Mediator value for each unit, binary and unidimensional

x_converted : array-like, shape (n_samples, n_features_covariates)
Covariates value for each unit, continuous
"""
# check format
if len(y) != len(y.ravel()):
raise ValueError("Multidimensional y (outcome) is not supported")

if len(t) != len(t.ravel()):
raise ValueError("Multidimensional t (exposure) is not supported")

if len(np.unique(t)) != 2:
raise ValueError("Only a binary t (exposure) is supported")

n = len(y)
t_converted = t.ravel()
y_converted = y.ravel()

if n != len(x) or n != len(m) or n != len(t):
raise ValueError("Inputs don't have the same number of observations")

if len(x.shape) == 1:
x_converted = x.reshape(n, 1)
else:
x_converted = x

if len(m.shape) == 1:
m_converted = m.reshape(n, 1)
else:
m_converted = m

if (m_converted.shape[1] >1) and (setting != 'multidimensional'):
raise ValueError("Multidimensional m (mediator) is not supported")

if (setting == 'binary') and (len(np.unique(m)) != 2):
raise ValueError(
"Only a binary one-dimensional m (mediator) is supported")

return y_converted, t_converted, m_converted, x_converted


Loading
Loading