Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add model builder #87

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 258 additions & 0 deletions birdman/builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
from pathlib import Path
from pkg_resources import resource_filename

import biom
from jinja2 import Template
import numpy as np
import pandas as pd
from patsy import dmatrix

from birdman import SingleFeatureModel, TableModel

J2_DIR = Path(resource_filename("birdman", "jinja2"))
SF_TEMPLATE = J2_DIR / "negative_binomial_single.j2.stan"
FULL_TEMPLATE = J2_DIR / "negative_binomial_full.j2.stan"


def create_single_feature_model(
table: biom.Table,
metadata: pd.DataFrame,
stan_file_path: Path,
fixed_effects: list = None,
random_effects: list = None,
beta_prior: float = 5.0,
group_var_prior: float = 1.0,
inv_disp_sd_prior: float = 0.5
) -> SingleFeatureModel:
"""Build SingleFeatureModel.

:param table: Feature table (features x samples)
:type table: biom.table.Table

:param metadata: Metadata for design matrix
:type metadata: pd.DataFrame

:param stan_file: Path to save rendered Stan file
:type stan_file: pathlib.Path

:param fixed_effects: List of fixed effects to include in model
:type fixed_effects: list

:param random_effects: List of random effects to include in model
:type random_effects: list

:param beta_prior: Standard deviation for normally distributed prior values
of beta, defaults to 5.0
:type beta_prior: float

:param group_var_prior: Standard deviation for normally distributed prior
values of random effects, defaults to 1.0
:type group_var_prio: float

:param inv_disp_sd_prior: Standard deviation for lognormally distributed
prior values of 1/phi, defaults to 0.5
:type inv_disp_sd: float

:returns: SingleFeatureModel with specified fixed and random effects
:rtype: birdman.model_base.SingleFeatureModel
"""
if not set(table.ids()) == set(metadata.index):
raise ValueError("Sample IDs must match!")

fe_formula = " + ".join(fixed_effects)
dmat = dmatrix(fe_formula, metadata, return_type="dataframe")

sf_stanfile = _render_stanfile(SF_TEMPLATE, metadata, random_effects)

with open(stan_file_path, "w") as f:
f.write(sf_stanfile)

class _SingleFeatureModel(SingleFeatureModel):
def __init__(self, feature_id: str):
super().__init__(table=table, feature_id=feature_id,
model_path=stan_file_path)
self.feature_id = feature_id
values = table.data(
id=feature_id,
axis="observation",
dense=True
).astype(int)

A = np.log(1 / table.shape[0])

param_dict = {
"y": values,
"x": dmat,
"p": dmat.shape[1],
"depth": np.log(table.sum("sample")),
"A": A,
"B_p": beta_prior,
"inv_disp_sd": inv_disp_sd_prior,
"re_p": group_var_prior
}

self.re_dict = dict()

for group_var in random_effects:
group_var_series = metadata[group_var].loc[self.sample_names]
group_subj_map = (
group_var_series.astype("category").cat.codes + 1
)
param_dict[f"{group_var}_map"] = group_subj_map

self.re_dict[group_var] = np.sort(group_var_series.unique())

self.add_parameters(param_dict)

self.specify_model(
params=["beta_var", "inv_disp"],
dims={
"beta_var": ["covariate"],
"log_lhood": ["tbl_sample"],
"y_predict": ["tbl_sample"],
"inv_disp": []
},
coords={
"covariate": dmat.columns,
"tbl_sample": self.sample_names,
},
include_observed_data=True,
posterior_predictive="y_predict",
log_likelihood="log_lhood"
)

return _SingleFeatureModel


def create_table_model(
table: biom.Table,
metadata: pd.DataFrame,
stan_file_path: Path,
fixed_effects: list = None,
random_effects: list = None,
beta_prior: float = 5.0,
group_var_prior: float = 1.0,
inv_disp_sd_prior: float = 0.5
):
"""Build TableModel.

:param table: Feature table (features x samples)
:type table: biom.table.Table

:param metadata: Metadata for design matrix
:type metadata: pd.DataFrame

:param stan_file: Path to save rendered Stan file
:type stan_file: pathlib.Path

:param fixed_effects: List of fixed effects to include in model
:type fixed_effects: list

:param random_effects: List of random effects to include in model
:type random_effects: list

:param beta_prior: Standard deviation for normally distributed prior values
of beta, defaults to 5.0
:type beta_prior: float

:param group_var_prior: Standard deviation for normally distributed prior
values of random effects, defaults to 1.0
:type group_var_prio: float

:param inv_disp_sd_prior: Standard deviation for lognormally distributed
prior values of 1/phi, defaults to 0.5
:type inv_disp_sd_prior: float

:returns: TableModel with specified fixed and random effects
:rtype: birdman.model_base.TableModel
"""
if not set(table.ids()) == set(metadata.index):
raise ValueError("Sample IDs must match!")

fe_formula = " + ".join(fixed_effects)
dmat = dmatrix(fe_formula, metadata, return_type="dataframe")

sf_stanfile = _render_stanfile(FULL_TEMPLATE, metadata, random_effects)

with open(stan_file_path, "w") as f:
f.write(sf_stanfile)

class _TableModel(TableModel):
def __init__(self):
super().__init__(table=table, model_path=stan_file_path)

A = np.log(1 / table.shape[0])

param_dict = {
"x": dmat,
"p": dmat.shape[1],
"depth": np.log(table.sum("sample")),
"A": A,
"B_p": beta_prior,
"inv_disp_sd": inv_disp_sd_prior,
"re_p": group_var_prior
}

self.re_dict = dict()

for group_var in random_effects:
group_var_series = metadata[group_var].loc[self.sample_names]
group_subj_map = (
group_var_series.astype("category").cat.codes + 1
)
param_dict[f"{group_var}_map"] = group_subj_map

self.re_dict[group_var] = np.sort(group_var_series.unique())

self.add_parameters(param_dict)

self.specify_model(
params=["beta_var", "inv_disp"],
dims={
"beta_var": ["covariate", "feature_alr"],
"log_lhood": ["tbl_sample", "feature"],
"y_predict": ["tbl_sample", "feature"],
"inv_disp": ["feature"]
},
coords={
"covariate": dmat.columns,
"tbl_sample": self.sample_names,
"feature": table.ids("observation"),
"feature_alr": table.ids("observation")[1:]
},
include_observed_data=True,
posterior_predictive="y_predict",
log_likelihood="log_lhood"
)

return _TableModel


def _render_stanfile(
template_path: Path,
metadata: pd.DataFrame,
random_effects: list = None
) -> str:
"""Render Stan file given fixed and random effects.

:param template_path: Path to Jinja2 template file
:type template_path: pathlib.Path

:param metadata: Metadata for design matrix
:type metadata: pd.DataFrame

:param random_effects: List of random effects to include in model
:type random_effects: list

:returns: Rendred Jinja2 template
:rtype: str
"""
re_dict = dict()
for group in random_effects:
n = len(metadata[group].unique())
re_dict[group] = n

with open(template_path, "r") as f:
stanfile = Template(f.read()).render({"re_dict": re_dict})

return stanfile
84 changes: 84 additions & 0 deletions birdman/jinja2/negative_binomial_full.j2.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
data {
int<lower=1> N; // number of samples
int<lower=1> D; // number of features
int<lower=1> p; // number of covariates
real A; // mean intercept
vector[N] depth; // log sequencing depths of microbes
matrix[N, p] x; // covariate matrix
array[N, D] int y; // observed microbe abundances

real<lower=0> B_p; // stdev for beta normal prior
real<lower=0> inv_disp_sd; // stdev for inv disp lognormal prior

// Random Effects
real<lower=0> re_p; // stdev for random effect normal prior
{% for re_name, num_factors in re_dict.items() %}
array[N] int<lower=1, upper={{ num_factors }}> {{ re_name }}_map;
{%- endfor %}
// End Random Effects
}

parameters {
row_vector<offset=A, multiplier=B_p>[D-1] beta_0;
matrix<multiplier=B_p>[p-1, D-1] beta_x;
vector<lower=0>[D] inv_disp;

// Random Effects
{%- for re_name, num_factors in re_dict.items() %}
matrix[{{ num_factors }}, D-1] {{ re_name }}_eff;
{%- endfor %}
// End Random Effects
}

transformed parameters {
matrix[p, D-1] beta_var = append_row(beta_0, beta_x);
matrix[N, D-1] lam = x * beta_var;
matrix[N, D] lam_clr;

// Random Effects
for (n in 1:N) {
lam[n] += depth[n];
{%- for re_name, num_factors in re_dict.items() %}
lam[n] += {{ re_name }}_eff[{{ re_name }}_map[n]];
{%- endfor %}
}
// End Random Effects

lam_clr = append_col(to_vector(rep_array(0, N)), lam);
}

model {
inv_disp ~ lognormal(0., inv_disp_sd);

beta_0 ~ normal(A, B_p);
for (i in 1:D-1){
for (j in 1:p-1){
beta_x[j, i] ~ normal(0., B_p);
}
// Random Effects
{%- for re_name, num_factors in re_dict.items() %}
for (j in 1:{{ num_factors }}) {
{{ re_name }}_eff[j, i] ~ normal(0, re_p);
}
{%- endfor %}
// End Random Effects
}

for (n in 1:N){
for (i in 1:D){
target += neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv_disp[i]);
}
}
}

generated quantities {
array[N, D] int y_predict;
array[N, D] real log_lhood;

for (n in 1:N){
for (i in 1:D){
y_predict[n, i] = neg_binomial_2_log_rng(lam_clr[n, i], inv_disp[i]);
log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv_disp[i]);
}
}
}
Loading