diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7aa835b..8d26380 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -21,7 +21,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v2
@@ -39,7 +39,14 @@ jobs:
 
     - name: Install conda packages
       shell: bash -l {0}
-      run: mamba install -c conda-forge biom-format patsy pytest xarray scikit-bio flake8 arviz cmdstanpy
+      run: mamba install -c conda-forge biom-format patsy pytest xarray scikit-bio flake8 arviz
+
+    # Temp req before CmdStanPy cuts a new release
+    - name: Install develop branch of CmdStanPy
+      shell: bash -l {0}
+      run: >
+        pip install git+https://github.com/stan-dev/cmdstanpy.git@develop;
+        install_cmdstan
 
     - name: Install BIRDMAn
       shell: bash -l {0}
diff --git a/Makefile b/Makefile
index 57e99d7..7aca32a 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ stylecheck:
 	flake8 birdman/*.py tests/*.py setup.py
 
 pytest:
-	pytest
+	pytest tests -W ignore::FutureWarning
 
 documentation:
 	cd docs && make html
diff --git a/birdman/default_models.py b/birdman/default_models.py
index 6eeef9c..1ec485c 100644
--- a/birdman/default_models.py
+++ b/birdman/default_models.py
@@ -192,7 +192,8 @@ def __init__(
             dims={
                 "beta_var": ["covariate"],
                 "log_lhood": ["tbl_sample"],
-                "y_predict": ["tbl_sample"]
+                "y_predict": ["tbl_sample"],
+                "inv_disp": []
             },
             coords={
                 "covariate": self.colnames,
diff --git a/birdman/diagnostics.py b/birdman/diagnostics.py
index 48fd572..ea4987f 100644
--- a/birdman/diagnostics.py
+++ b/birdman/diagnostics.py
@@ -102,6 +102,7 @@ def r2_score(inference_object: az.InferenceData) -> pd.Series:
     :returns: Bayesian :math:`R^2` & standard deviation
     :rtype: pd.Series
     """
+
     if "observed_data" not in inference_object.groups():
         raise ValueError("Inference data is missing observed data!")
 
diff --git a/birdman/inference.py b/birdman/inference.py
index 995c14d..15fe528 100644
--- a/birdman/inference.py
+++ b/birdman/inference.py
@@ -1,118 +1,65 @@
-from typing import List, Sequence
+from typing import List, Sequence, Union
 
 import arviz as az
-from cmdstanpy import CmdStanMCMC
+from cmdstanpy import CmdStanMCMC, CmdStanVB
+import numpy as np
 import xarray as xr
 
-from .util import _drop_data
-
 
-def full_fit_to_inference(
-    fit: CmdStanMCMC,
+def fit_to_inference(
+    fit: Union[CmdStanMCMC, CmdStanVB],
+    chains: int,
+    draws: int,
     params: Sequence[str],
     coords: dict,
     dims: dict,
-    alr_params: Sequence[str] = None,
     posterior_predictive: str = None,
     log_likelihood: str = None,
-) -> az.InferenceData:
-    """Convert fitted Stan model into inference object.
-
-    :param fit: Fitted model
-    :type params: CmdStanMCMC
-
-    :param params: Posterior fitted parameters to include
-    :type params: Sequence[str]
-
-    :param coords: Mapping of entries in dims to labels
-    :type coords: dict
-
-    :param dims: Dimensions of parameters in the model
-    :type dims: dict
-
-    :param alr_params: Parameters to convert from ALR to CLR
-    :type alr_params: Sequence[str], optional
-
-    :param posterior_predictive: Name of posterior predictive values from
-        Stan model to include in ``arviz`` InferenceData object
-    :type posterior_predictive: str, optional
-
-    :param log_likelihood: Name of log likelihood values from Stan model
-        to include in ``arviz`` InferenceData object
-    :type log_likelihood: str, optional
-
-    :returns: ``arviz`` InferenceData object with selected values
-    :rtype: az.InferenceData
-    """
+):
     if log_likelihood is not None and log_likelihood not in dims:
         raise KeyError("Must include dimensions for log-likelihood!")
     if posterior_predictive is not None and posterior_predictive not in dims:
         raise KeyError("Must include dimensions for posterior predictive!")
 
-    inference = az.from_cmdstanpy(
-        fit,
-        coords=coords,
-        log_likelihood=log_likelihood,
-        posterior_predictive=posterior_predictive,
-        dims=dims
-    )
-
-    vars_to_drop = set(inference.posterior.data_vars).difference(params)
-    inference.posterior = _drop_data(inference.posterior, vars_to_drop)
+    das = dict()
 
-    return inference
+    for param in params:
+        data = fit.stan_variable(param)
 
+        _dims = dims[param]
+        _coords = {k: coords[k] for k in _dims}
 
-def single_feature_fit_to_inference(
-    fit: CmdStanMCMC,
-    params: Sequence[str],
-    coords: dict,
-    dims: dict,
-    posterior_predictive: str = None,
-    log_likelihood: str = None,
-) -> az.InferenceData:
-    """Convert single feature fit to InferenceData.
+        das[param] = stan_var_to_da(data, _coords, _dims, chains, draws)
 
-    :param fit: Single feature fit with CmdStanPy
-    :type fit: cmdstanpy.CmdStanMCMC
+    if log_likelihood:
+        data = fit.stan_variable(log_likelihood)
 
-    :param params: Posterior fitted parameters to include
-    :type params: Sequence[str]
+        _dims = dims[log_likelihood]
+        _coords = {k: coords[k] for k in _dims}
 
-    :param coords: Coordinates to use for annotating Inference dims
-    :type coords: dict
+        ll_da = stan_var_to_da(data, _coords, _dims, chains, draws)
+        ll_ds = xr.Dataset({log_likelihood: ll_da})
+    else:
+        ll_ds = None
 
-    :param dims: Dimensions of parameters in fitted model
-    :type dims: dict
+    if posterior_predictive:
+        data = fit.stan_variable(posterior_predictive)
 
-    :param posterior_predictive: Name of variable holding PP values
-    :type posterior_predictive: str
+        _dims = dims[posterior_predictive]
+        _coords = {k: coords[k] for k in _dims}
 
-    :param log_likelihood: Name of variable holding LL values
-    :type log_likelihood: str
+        pp_da = stan_var_to_da(data, _coords, _dims, chains, draws)
+        pp_ds = xr.Dataset({posterior_predictive: pp_da})
+    else:
+        pp_ds = None
 
-    :returns: InferenceData object of single feature
-    :rtype: az.InferenceData
-    """
-    _coords = coords.copy()
-    if "feature" in coords:
-        _coords.pop("feature")
-
-    _dims = dims.copy()
-    for k, v in _dims.items():
-        if "feature" in v:
-            v.remove("feature")
-
-    feat_inf = az.from_cmdstanpy(
-        posterior=fit,
-        posterior_predictive=posterior_predictive,
-        log_likelihood=log_likelihood,
-        coords=_coords,
-        dims=_dims
+    inf = az.InferenceData(
+        posterior=xr.Dataset(das),
+        log_likelihood=ll_ds,
+        posterior_predictive=pp_ds
     )
-    vars_to_drop = set(feat_inf.posterior.data_vars).difference(params)
-    feat_inf.posterior = _drop_data(feat_inf.posterior, vars_to_drop)
-    return feat_inf
+
+    return inf
 
 
@@ -165,3 +112,27 @@ def concatenate_inferences(
         all_group_inferences.append(group_inf)
 
     return az.concat(*all_group_inferences)
+
+
+def stan_var_to_da(
+    data: np.ndarray,
+    coords: dict,
+    dims: dict,
+    chains: int,
+    draws: int
+):
+    """Convert Stan variable draws to xr.DataArray.
+
+    """
+    data = np.stack(np.split(data, chains))
+
+    coords["draw"] = np.arange(draws)
+    coords["chain"] = np.arange(chains)
+    dims = ["chain", "draw"] + dims
+
+    da = xr.DataArray(
+        data,
+        coords=coords,
+        dims=dims,
+    )
+    return da
diff --git a/birdman/model_base.py b/birdman/model_base.py
index 1911f8f..ceb7d25 100644
--- a/birdman/model_base.py
+++ b/birdman/model_base.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from patsy import dmatrix
 
-from .inference import full_fit_to_inference, single_feature_fit_to_inference
+from .inference import fit_to_inference
 
 
 class BaseModel(ABC):
@@ -114,7 +114,7 @@ def add_parameters(self, param_dict: dict = None):
 
     def fit_model(
         self,
-        method: str = "mcmc",
+        method: str = "vi",
         num_draws: int = 500,
         mcmc_warmup: int = None,
         mcmc_chains: int = 4,
@@ -167,6 +167,9 @@ def fit_model(
             mcmc_kwargs = mcmc_kwargs or dict()
             mcmc_warmup = mcmc_warmup or mcmc_warmup
 
+            self.num_chains = mcmc_chains
+            self.num_draws = num_draws
+
             self.fit = self.sm.sample(
                 chains=mcmc_chains,
                 parallel_chains=mcmc_chains,
@@ -179,6 +182,9 @@ def fit_model(
         elif method == "vi":
             vi_kwargs = vi_kwargs or dict()
 
+            self.num_chains = 1
+            self.num_draws = num_draws
+
             self.fit = self.sm.variational(
                 data=self.dat,
                 iter=vi_iter,
@@ -224,8 +230,10 @@ def to_inference(self) -> az.InferenceData:
         """
         self._check_fit_for_inf()
 
-        inference = full_fit_to_inference(
+        inference = fit_to_inference(
             fit=self.fit,
+            chains=self.num_chains,
+            draws=self.num_draws,
             params=self.params,
             coords=self.coords,
             dims=self.dims,
@@ -246,6 +254,7 @@ def to_inference(self) -> az.InferenceData:
             }
         )
         inference = az.concat(inference, obs)
+
         return inference
 
 
@@ -269,8 +278,10 @@ def to_inference(self) -> az.InferenceData:
         """
         self._check_fit_for_inf()
 
-        inference = single_feature_fit_to_inference(
+        inference = fit_to_inference(
             fit=self.fit,
+            chains=self.num_chains,
+            draws=self.num_draws,
             params=self.params,
             coords=self.coords,
             dims=self.dims,
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ee42cb..b3d2020 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,7 +47,7 @@ def model():
         metadata=md,
     )
     nb.compile_model()
-    nb.fit_model(mcmc_chains=4, num_draws=100)
+    nb.fit_model(method="mcmc", mcmc_chains=4, num_draws=100)
     return nb
 
 
@@ -76,7 +76,7 @@ def single_feat_model():
     )
 
     nb.compile_model()
-    nb.fit_model(mcmc_chains=4, num_draws=100)
+    nb.fit_model(method="mcmc", mcmc_chains=4, num_draws=100)
     return nb
 
diff --git a/tests/test_custom_model.py b/tests/test_custom_model.py
index 2da859a..988b0f5 100644
--- a/tests/test_custom_model.py
+++ b/tests/test_custom_model.py
@@ -40,10 +40,11 @@ def test_custom_model(table_biom, metadata):
         },
    )
    custom_model.compile_model()
-    custom_model.fit_model(num_draws=100, mcmc_chains=4, seed=42)
+    custom_model.fit_model(method="mcmc", num_draws=100, mcmc_chains=4,
+                           seed=42)
    inference = custom_model.to_inference()
 
-    assert set(inference.groups()) == {"posterior", "sample_stats"}
+    assert set(inference.groups()) == {"posterior"}
    ds = inference.posterior
    assert ds.coords._names == {"chain", "covariate", "draw", "feature_alr"}
diff --git a/tests/test_inference.py b/tests/test_inference.py
index 06e89bc..cb55ac0 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -26,8 +26,10 @@ def dataset_comparison(self, model, ds):
         assert (ds.coords["chain"] == [0, 1, 2, 3]).all()
 
     def test_serial_to_inference(self, example_model):
-        inf = mu.full_fit_to_inference(
+        inf = mu.fit_to_inference(
             fit=example_model.fit,
+            chains=4,
+            draws=100,
             coords={
                 "feature": example_model.feature_names,
                 "feature_alr": example_model.feature_names[1:],
@@ -41,7 +43,6 @@ def test_serial_to_inference(self, example_model):
                 "y_predict": ["tbl_sample", "feature"]
             },
             params=["beta_var", "inv_disp"],
-            alr_params=["beta_var"]
         )
 
         self.dataset_comparison(example_model, inf.posterior)
@@ -49,8 +50,10 @@
 # Posterior predictive & log likelihood
 class TestPPLL:
     def test_serial_ppll(self, example_model):
-        inf = mu.full_fit_to_inference(
+        inf = mu.fit_to_inference(
             fit=example_model.fit,
+            chains=4,
+            draws=100,
             coords={
                 "feature": example_model.feature_names,
                 "feature_alr": example_model.feature_names[1:],
@@ -64,7 +67,6 @@ def test_serial_ppll(self, example_model):
                 "y_predict": ["sample", "feature"]
             },
             params=["beta_var", "inv_disp"],
-            alr_params=["beta_var"],
             posterior_predictive="y_predict",
             log_likelihood="log_lhood",
         )
diff --git a/tests/test_model.py b/tests/test_model.py
index d8d6131..d1efdee 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -49,7 +49,7 @@ def test_nb_lme(self, table_biom, metadata):
             metadata=md,
         )
         nb_lme.compile_model()
-        nb_lme.fit_model(num_draws=100)
+        nb_lme.fit_model(method="mcmc", num_draws=100)
         inf = nb_lme.to_inference()
         post = inf.posterior
 
@@ -68,13 +68,13 @@ def test_single_feat(self, table_biom, metadata):
             metadata=md,
         )
         nb.compile_model()
-        nb.fit_model(num_draws=100)
+        nb.fit_model(method="mcmc", num_draws=100)
 
 
 class TestToInference:
     def test_serial_to_inference(self, example_model):
         inference_data = example_model.to_inference()
-        target_groups = {"posterior", "sample_stats", "log_likelihood",
+        target_groups = {"posterior", "log_likelihood",
                          "posterior_predictive", "observed_data"}
         assert set(inference_data.groups()) == target_groups
 
@@ -159,5 +159,6 @@ def test_iteration_fit(self, table_biom, metadata):
 
         for fit, model in model_iterator:
             model.compile_model()
-            model.fit_model(num_draws=100, mcmc_chains=4, seed=42)
+            model.fit_model(method="mcmc", num_draws=100, mcmc_chains=4,
+                            seed=42)
             _ = model.to_inference()
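Note (not part of the patch): the refactored `fit_to_inference` builds the `az.InferenceData` by hand from `fit.stan_variable(...)` output instead of going through `az.from_cmdstanpy`, which is what lets it accept both `CmdStanMCMC` and `CmdStanVB` fits. The non-obvious step is `stan_var_to_da`, which splits CmdStanPy's flattened draws back into per-chain arrays. A minimal sketch of that reshaping, using made-up shapes (4 chains × 100 draws of a parameter with 3 covariates; the coordinate labels are hypothetical):

```python
import numpy as np
import xarray as xr

chains, draws = 4, 100

# fit.stan_variable(name) returns draws stacked across chains:
# shape (chains * draws, ...). Here we fake a (400, 3) parameter.
flat = np.zeros((chains * draws, 3))

# Same reshaping as stan_var_to_da: split the first axis into one
# block per chain, then stack into (chain, draw, ...).
data = np.stack(np.split(flat, chains))  # -> (4, 100, 3)

da = xr.DataArray(
    data,
    coords={
        "chain": np.arange(chains),
        "draw": np.arange(draws),
        "covariate": ["Intercept", "x1", "x2"],  # hypothetical labels
    },
    dims=["chain", "draw", "covariate"],
)
print(da.sizes)  # chain: 4, draw: 100, covariate: 3
```

For a variational fit there is no chain dimension to recover, which is presumably why the `method == "vi"` branch records `self.num_chains = 1`.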
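For orientation only, a hedged end-to-end sketch of how the changed API would now be driven; the `NegativeBinomial` default model, the input file names, and the formula column are assumptions, not taken from this diff:

```python
import biom
import pandas as pd

from birdman import NegativeBinomial  # assumed default-model export

table = biom.load_table("counts.biom")                          # hypothetical inputs
metadata = pd.read_csv("metadata.tsv", sep="\t", index_col=0)

nb = NegativeBinomial(table=table, formula="diet", metadata=metadata)
nb.compile_model()

# method now defaults to "vi"; the updated tests pass method="mcmc"
# explicitly to keep the previous sampling behavior.
nb.fit_model(method="mcmc", mcmc_chains=4, num_draws=100, seed=42)

inference = nb.to_inference()  # az.InferenceData built via fit_to_inference
```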