From c595aeed899469da326854e31da1bdce2a0cf8b1 Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Wed, 31 Mar 2021 17:29:19 -0700 Subject: [PATCH 1/7] update setup & docs with arviz master installation --- README.md | 3 ++- docs/index.rst | 1 + setup.py | 26 ++++++++++++++++++++++---- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 576e5bd..fcbc5d5 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,12 @@ Currently BIRDMAn requires Python 3.7 or higher. ```bash conda install -c conda-forge dask biom-format patsy xarray pip install cmdstanpy +pip install git+git://github.com/arviz-devs/arviz.git git clone git@github.com:gibsramen/BIRDMAn.git cd BIRDMAn pip install . ``` -You have to install `cmdstan` through the `cmdstanpy.install_cmdstan` function first. +You have to install `cmdstan` through the `cmdstanpy.install_cmdstan` function first. See the [CmdStanPy documentation](https://mc-stan.org/cmdstanpy/installation.html#install-cmdstan) for details. See the [documentation](https://birdman.readthedocs.io/en/latest/?badge=latest) for details on usage. diff --git a/docs/index.rst b/docs/index.rst index 7d5dea0..2656dfd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,6 +21,7 @@ The easiest way to install the required dependencies is through ``conda``. conda install -c conda-forge dask biom-format patsy xarray git clone git@github.com:gibsramen/BIRDMAn.git + pip install git+git://github.com/arviz-devs/arviz.git cd BIRDMAn pip install -e . diff --git a/setup.py b/setup.py index 28ee82f..73113c1 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,28 @@ __version__ = "0.0.1" +# Inspired by the EMPress classifiers +classifiers = """ + Development Status :: 2 - Pre-Alpha + License :: OSI Approved :: BSD License + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Intended Audience :: Science/Research + Operating System :: POSIX + Operating System :: MacOS :: MacOS X + Topic :: Scientific/Engineering :: Bio-Informatics +""" +short_desc = "Framework for differential abundance using Bayesian inference" + setup( name="birdman", author="Gibraan Rahman", + author_email="grahman@eng.ucsd.edu", + description=short_desc, + url="https://github.com/gibsramen/BIRDMAn", version=__version__, + license='BSD-3-Clause', packages=find_packages(), include_package_data=True, package_data={"": ["*.stan"]}, @@ -16,9 +34,9 @@ "biom-format", "patsy", "xarray", - "arviz @ git+git://github.com/arviz-devs/arviz.git" - # Temporary solution until github.com/arviz-devs/arviz/pull/1579 - # is included in on PyPI + "pandas", + "arviz" ], - extras_require={"dev": ["pytest", "scikit-bio"]} + extras_require={"dev": ["pytest", "scikit-bio", "sphinx"]}, + classifiers=classifiers ) From 4239385b7c29bc49e241f06018330184224c71ea Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Wed, 31 Mar 2021 17:30:24 -0700 Subject: [PATCH 2/7] add viz docs change coord to coords in plot parameter estimate and remove axes labels --- birdman/visualization.py | 15 ++++++--------- docs/index.rst | 1 + docs/visualization.rst | 6 ++++++ tests/test_visualization.py | 4 ++-- 4 files changed, 15 insertions(+), 11 deletions(-) create mode 100644 docs/visualization.rst diff --git a/birdman/visualization.py b/birdman/visualization.py index 315ab90..8a41684 100644 --- a/birdman/visualization.py +++ b/birdman/visualization.py @@ -7,7 +7,7 @@ def plot_parameter_estimates( inference_object: az.InferenceData, parameter: str, - coord: dict = dict(), + coords: dict = dict(), num_std: float = 1.0 ): """Plot credible intervals of estimated parameters. @@ -18,8 +18,8 @@ def plot_parameter_estimates( :param parameter: Name of parameter to plot :type parameter: str - :param coord: Coordinates of parameter to plot - :type coord: dict, optional + :param coords: Coordinates of parameter to plot + :type coords: dict, optional :param num_std: Number of standard deviations to plot as error bars, defaults to 1.0 @@ -28,14 +28,14 @@ def plot_parameter_estimates( :returns: matplotlib axes figure """ posterior = inference_object.posterior - if len(posterior[parameter].coords) > 3 and not coord: + if len(posterior[parameter].coords) > 3 and not coords: raise ValueError( "Must provide coordinates if plotting multi-dimensional parameter" " estimates!" ) - param_means = posterior[parameter].sel(**coord).mean(["chain", "draw"]) - param_stds = posterior[parameter].sel(**coord).std(["chain", "draw"]) + param_means = posterior[parameter].sel(**coords).mean(["chain", "draw"]) + param_stds = posterior[parameter].sel(**coords).std(["chain", "draw"]) sort_indices = param_means.argsort().data param_means = param_means.data[sort_indices] param_stds = param_stds.data[sort_indices] @@ -45,9 +45,6 @@ def plot_parameter_estimates( ax.errorbar(x=x, y=param_means, yerr=param_stds*num_std, zorder=0) ax.scatter(x=x, y=param_means, color="black", zorder=2, s=5) - ax.set_xlabel("Feature") - ax.set_ylabel("Differential") - return ax diff --git a/docs/index.rst b/docs/index.rst index 2656dfd..f0247fb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,6 +59,7 @@ In Python: diagnostics stats util + visualization Indices and tables ------------------ diff --git a/docs/visualization.rst b/docs/visualization.rst new file mode 100644 index 0000000..b4a563a --- /dev/null +++ b/docs/visualization.rst @@ -0,0 +1,6 @@ +Visualization Functions +======================= + +.. automodule:: birdman.visualization + :members: + diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 0f23100..a1b5219 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -8,7 +8,7 @@ def test_rank_plot_beta(self, example_inf): viz.plot_parameter_estimates( inference_object=example_inf, parameter="beta", - coord={"covariate": "host_common_name[T.long-tailed macaque]"}, + coords={"covariate": "host_common_name[T.long-tailed macaque]"}, num_std=1 ) @@ -19,7 +19,7 @@ def test_rank_plot_phi(self, example_inf): num_std=1 ) - def test_rank_plot_no_coord(self, example_inf): + def test_rank_plot_no_coords(self, example_inf): with pytest.raises(ValueError) as excinfo: viz.plot_parameter_estimates( inference_object=example_inf, From d447b14ba14b241cfb29ee81b8e76d391c62ead4 Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Thu, 1 Apr 2021 13:53:29 -0700 Subject: [PATCH 3/7] add custom model doc page --- docs/custom_model.rst | 315 ++++++++++++++++++++++++++++++++++++ docs/index.rst | 7 +- docs/models.rst | 3 + docs/user_defined_model.rst | 2 - 4 files changed, 323 insertions(+), 4 deletions(-) create mode 100644 docs/custom_model.rst delete mode 100644 docs/user_defined_model.rst diff --git a/docs/custom_model.rst b/docs/custom_model.rst new file mode 100644 index 0000000..d6cc437 --- /dev/null +++ b/docs/custom_model.rst @@ -0,0 +1,315 @@ +Implementing a custom model +=========================== + +One of the core features of BIRDMAn is that it is extensible to user-defined differential abundance models. This means that BIRDMAn has been designed from the beginning to support custom analysis. Custom modeling is implemented through user-written Stan files. + +Here we will walk through a simple custom model and how to incorporate it into BIRDMAn. We will use data from the study "Linking the effects of helminth infection, diet and the gut microbiota with human whole-blood signatures (repeated measurements)" (Qiita ID: 11913). Samples were taken from subjects before and after a deworming procedure. + +This paired design introduces a wrinkle in traditional differential abundance analysis: `pseudoreplication `_. It is statistically inadmissible to fail to account for repeated measurements as in the case of longitudinal data. One can account for this as a fixed effect in other tools, but really subject-level variation is likely better modeled as a random effect. For this example we will specify a linear mixed-effects (LME) model where ``subject`` is a random intercept. + +Downloading data from Qiita +------------------------------- + +First we will download the BIOM table and metadata from Qiita. + +.. code-block:: bash + + wget -O data.zip "https://qiita.ucsd.edu/public_artifact_download/?artifact_id=44773" + wget -O metadata.zip "https://qiita.ucsd.edu/public_download/?data=sample_information&study_id=107" + unzip data.zip + unzip metadata.zip + +We then import the data into Python so we can use BIRDMAn. + +.. code-block:: python + + import biom + import pandas as pd + + table = biom.load_table("BIOM/94270/reference-hit.biom") + metadata = pd.read_csv( + "templates/11913_20191016-112545.txt", + sep="\t", + index_col=0 + ) + +Processing metadata +------------------- + +We will now determine which subjects in the metadata have samples at both pre and post-deworm timepoints. + +.. code-block:: python + + subj_is_paired = ( + metadata + .groupby("host_subject_id") + .apply(lambda x: (x["time_point"].values == [1, 2]).all()) + ) + paired_subjs = subj_is_paired[subj_is_paired].index + paired_samps = metadata[metadata["host_subject_id"].isin(paired_subjs)].index + cols_to_keep = ["time_point", "host_subject_id"] + metadata_model = metadata.loc[paired_samps, cols_to_keep].dropna() + metadata_model["time_point"] = ( + metadata_model["time_point"].map({1: "pre-deworm", 2: "post-deworm"}) + ) + metadata_model["host_subject_id"] = "S" + metadata["host_subject_id"].astype(str) + metadata_model.head() + +.. list-table:: + :header-rows: 1 + :stub-columns: 1 + + * - sample-name + - time_point + - host_subject_id + * - 11913.102 + - pre-deworm + - S102 + * - 11913.102AF + - post-deworm + - S102 + * - 11913.1097 + - pre-deworm + - S1097 + * - 11913.1097AF + - post-deworm + - S1097 + * - 11913.119 + - pre-deworm + - S119 + +Filtering feature table +----------------------- + +This table has nearly 3000 features, most of which are likely lowly prevalent. We will first filter the table to only include the samples we previously described and then filter out features that are present in fewer than 5 samples. + +.. code-block:: python + + raw_tbl_df = table.to_dataframe() + samps_to_keep = sorted(list(set(raw_tbl_df.columns).intersection(metadata_model.index))) + filt_tbl_df = raw_tbl_df.loc[:, samps_to_keep] + prev = filt_tbl_df.clip(upper=1).sum(axis=1) + filt_tbl_df = filt_tbl_df.loc[prev[prev >= 5].index, :] + filt_tbl = biom.table.Table( + filt_tbl_df.values, + sample_ids=filt_tbl_df.columns, + observation_ids=filt_tbl_df.index + ) + +We now have a table of 269 features by 46 samples (23 subjects). This is a much more manageable size! + +Model specification +------------------- + +For this custom model we want to specify that ``time_point`` is a fixed effect and ``host_subject_id`` is a random effect. We are keeping this model relatively simple but you can imagine a more complicated model with random slopes, specified covariance structures, etc. Our model can thus be written as follows: + +.. math:: + + y_{ij} &\sim \textrm{NB}(\mu_{ij},\phi_j) + + \mu_{ij} &= n_i p_{ij} + + \textrm{alr}^{-1}(p_i) &= x_i \beta + z_i u + +Where :math:`z_i` represents the mapping of sample :math:`i` to subject and :math:`u` represents the subject coefficient vector. + +We also specify the following priors: + +.. math:: + + \beta_j &\sim \textrm{Normal}(0, B_p), B_p \in \mathbb{R}_{>0} + + \frac{1}{\phi_j} &\sim \textrm{Cauchy}(0, C_s), C_s \in + \mathbb{R}_{>0} + + u_i &\sim \textrm{Normal}(0, u_p), u_p \in \mathbb{R}_{>0} + + +Stan code +--------- + +We will save the below file to ``negative_binomial_re.stan`` so we can import and compile it in BIRDMAn. + + +.. code-block:: stan + + data { + int N; // number of sample IDs + int S; // number of groups (subjects) + int D; // number of dimensions + int p; // number of covariates + real depth[N]; // sequencing depths of microbes + matrix[N, p] x; // covariate matrix + int subj_ids[N]; // mapping of samples to subject IDs + int y[N, D]; // observed microbe abundances + real B_p; // stdev for covariate Beta Normal prior + real phi_s; // scale for dispersion Cauchy prior + real u_p; // stdev for subject intercept Normal prior + } + + parameters { + matrix[p, D-1] beta; + vector[D] reciprocal_phi; + vector[S] subj_int; + } + + transformed parameters { + matrix[N, D-1] lam; + matrix[N, D] lam_clr; + vector[D] phi; + + for (i in 1:D){ + phi[i] = 1. / reciprocal_phi[i]; + } + + lam = x*beta; // N x D-1 + for (n in 1:N){ + lam[n] += subj_int[subj_ids[n]]; + } + lam_clr = append_col(to_vector(rep_array(0, N)), lam); + } + + model { + // setting priors ... + for (i in 1:D){ + reciprocal_phi[i] ~ cauchy(0., phi_s); + } + for (i in 1:D-1){ + for (j in 1:p){ + beta[j, i] ~ normal(0., B_p); // uninformed prior + } + } + for (i in 1:S){ + subj_int[i] ~ normal(0., u_p); + } + + // generating counts + for (n in 1:N){ + for (i in 1:D){ + target += neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]); + } + } + } + + generated quantities { + matrix[N, D] y_predict; + matrix[N, D] log_lik; + + for (n in 1:N){ + for (i in 1:D){ + y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]); + log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]); + } + } + } + +Running BIRDMAn +--------------- + +We will now pass this file along with our table, metadata, and formula into BIRDMAn. Note that we are using the base ``Model`` class for our custom model. + +.. code-block:: python + + import birdman + + nb_lme = birdman.Model( # note that we are instantiating a base Model object + table=filt_tbl, + formula="C(time_point, Treatment('pre-deworm'))", + metadata=metadata_model.loc[samps_to_keep], + model_path="negative_binomial_re.stan", + num_iter=500, + chains=4, + seed=42 + ) + +We then want to update our data dictionary with the new parameters. + +By default BIRDMAn computes and includes: + +* ``y``: table data +* ``x``: covariate design matrix +* ``N``: number of samples +* ``D``: number of features +* ``p``: number of covariates (including Intercept) + +We want to add the necessary variables to be passed to Stan: + +* ``S``: total number of groups (subjects) +* ``subj_ids``: mapping of samples to subject +* ``B_p``: stdev prior for normally distributed covariate-feature coefficients +* ``phi_s``: scale prior for half-Cauchy distributed overdispersion coefficients +* ``depth``: log sampling depths of samples +* ``u_p``: stdev prior for normally distributed subject intercept shifts + +We want to provide ``subj_ids`` with a mapping of which sample corresponds to which subject. Stan does not understand strings so we encode each unique subject as an integer (starting at 1 because Stan 1-indexes arrays). + +.. code-block:: python + + import numpy as np + + group_var_series = metadata_model.loc[samps_to_keep]["host_subject_id"] + samp_subj_map = group_var_series.astype("category").cat.codes + 1 + groups = np.sort(group_var_series.unique()) + +Now we can add all the necessary parameters to BIRDMAn with the ``add_parameters`` function. + +.. code-block:: python + + param_dict = { + "S": len(groups), + "subj_ids": samp_subj_map.values, + "depth": np.log(filt_tbl.sum(axis="sample")), + "B_p": 3.0, + "phi_s": 3.0, + "u_p": 1.0 + } + nb_lme.add_parameters(param_dict) + +Finally, we compile and fit the model. + +.. note:: + + Fitting this model took approximately 6 minutes on my laptop. + +.. code-block:: python + + nb_lme.compile_model() + nb_lme.fit_model() + +Converting to ``InferenceData`` +------------------------------- + +With a custom model there is a bit more legwork involved in converting to the ``arviz.InferenceData`` data structure. We will step through each of the parameters in this example. + +* ``params``: List of parameters you want to include in the posterior draws (must match Stan code). +* ``dims``: Dictionary of dimensions of each parameter to include. Note that we also include the names of the variables for log likelihood and posterior predictive values, ``log_lik`` and ``y_predict`` respectively. +* ``coords``: Mapping of dimensions in ``dims`` to their indices. We internally save ``feature_names``, ``sample_names``, and ``colnames`` (names of covariates in design matrix). +* ``alr_params``: List of parameters which come out of Stan in ALR coordinates. These will be converted into centered CLR coordinates before being returned. +* ``posterior_predictive``: Name of variable holding posterior predictive values (optional). +* ``log_likelihood``: Name of variable holding log likelihood values (optional). +* ``include_observed_data``: Whether to include the original feature table as a group. This is useful for certain diagnostics. + +.. code-block:: python + + inference = nb_lme.to_inference_object( + params=["beta", "phi", "subj_int"], + dims={ + "beta": ["covariate", "feature"], + "phi": ["feature"], + "subj_int": ["subject"], + "log_lik": ["tbl_sample", "feature"], + "y_predict": ["tbl_sample", "feature"] + }, + coords={ + "feature": nb_lme.feature_names, + "covariate": nb_lme.colnames, + "subject": groups, + "tbl_sample": nb_lme.sample_names + }, + alr_params=["beta"], + posterior_predictive="y_predict", + log_likelihood="log_lik", + include_observed_data=True + ) + +With this you can use the rest of the BIRDMAn suite as usual or directly interact with the ``arviz`` library! diff --git a/docs/index.rst b/docs/index.rst index f0247fb..b6b294a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -46,10 +46,13 @@ In Python: .. toctree:: :maxdepth: 2 - :caption: Tutorial + :caption: User Guide default_model_example - user_defined_model + custom_model + diagnosting_model + working_with_stan_code + working_with_arviz .. toctree:: :maxdepth: 2 diff --git a/docs/models.rst b/docs/models.rst index 9b3977f..15d9841 100644 --- a/docs/models.rst +++ b/docs/models.rst @@ -14,5 +14,8 @@ Default Models -------------- .. autoclass:: birdman.default_models.NegativeBinomial + :members: .. autoclass:: birdman.default_models.NegativeBinomialLME + :members: .. autoclass:: birdman.default_models.Multinomial + :members: diff --git a/docs/user_defined_model.rst b/docs/user_defined_model.rst deleted file mode 100644 index 1537045..0000000 --- a/docs/user_defined_model.rst +++ /dev/null @@ -1,2 +0,0 @@ -Defining your own model -======================= From 1de99f7fbcd5f6e407033cafd2bdd8ac4ad641bc Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Thu, 1 Apr 2021 13:53:29 -0700 Subject: [PATCH 4/7] add custom model doc page --- docs/custom_model.rst | 315 ++++++++++++++++++++++++++++++++++++ docs/index.rst | 7 +- docs/models.rst | 3 + docs/user_defined_model.rst | 2 - 4 files changed, 323 insertions(+), 4 deletions(-) create mode 100644 docs/custom_model.rst delete mode 100644 docs/user_defined_model.rst diff --git a/docs/custom_model.rst b/docs/custom_model.rst new file mode 100644 index 0000000..40978e9 --- /dev/null +++ b/docs/custom_model.rst @@ -0,0 +1,315 @@ +Implementing a custom model +=========================== + +One of the core features of BIRDMAn is that it is extensible to user-defined differential abundance models. This means that BIRDMAn has been designed from the beginning to support custom analysis. Custom modeling is implemented through user-written Stan files. + +Here we will walk through a simple custom model and how to incorporate it into BIRDMAn. We will use data from the study "Linking the effects of helminth infection, diet and the gut microbiota with human whole-blood signatures (repeated measurements)" (Qiita ID: 11913). Samples were taken from subjects before and after a deworming procedure. + +This paired design introduces a wrinkle in traditional differential abundance analysis: `pseudoreplication `_. It is statistically inadmissible to fail to account for repeated measurements as in the case of longitudinal data. One can account for this as a fixed effect in other tools, but really subject-level variation is likely better modeled as a random effect. For this example we will specify a linear mixed-effects (LME) model where ``subject`` is a random intercept. + +Downloading data from Qiita +------------------------------- + +First we will download the BIOM table and metadata from Qiita. + +.. code-block:: bash + + wget -O data.zip "https://qiita.ucsd.edu/public_artifact_download/?artifact_id=44773" + wget -O metadata.zip "https://qiita.ucsd.edu/public_download/?data=sample_information&study_id=107" + unzip data.zip + unzip metadata.zip + +We then import the data into Python so we can use BIRDMAn. + +.. code-block:: python + + import biom + import pandas as pd + + table = biom.load_table("BIOM/94270/reference-hit.biom") + metadata = pd.read_csv( + "templates/11913_20191016-112545.txt", + sep="\t", + index_col=0 + ) + +Processing metadata +------------------- + +We will now determine which subjects in the metadata have samples at both pre and post-deworm timepoints. + +.. code-block:: python + + subj_is_paired = ( + metadata + .groupby("host_subject_id") + .apply(lambda x: (x["time_point"].values == [1, 2]).all()) + ) + paired_subjs = subj_is_paired[subj_is_paired].index + paired_samps = metadata[metadata["host_subject_id"].isin(paired_subjs)].index + cols_to_keep = ["time_point", "host_subject_id"] + metadata_model = metadata.loc[paired_samps, cols_to_keep].dropna() + metadata_model["time_point"] = ( + metadata_model["time_point"].map({1: "pre-deworm", 2: "post-deworm"}) + ) + metadata_model["host_subject_id"] = "S" + metadata["host_subject_id"].astype(str) + metadata_model.head() + +.. list-table:: + :header-rows: 1 + :stub-columns: 1 + + * - sample-name + - time_point + - host_subject_id + * - 11913.102 + - pre-deworm + - S102 + * - 11913.102AF + - post-deworm + - S102 + * - 11913.1097 + - pre-deworm + - S1097 + * - 11913.1097AF + - post-deworm + - S1097 + * - 11913.119 + - pre-deworm + - S119 + +Filtering feature table +----------------------- + +This table has nearly 3000 features, most of which are likely lowly prevalent. We will first filter the table to only include the samples we previously described and then filter out features that are present in fewer than 5 samples. + +.. code-block:: python + + raw_tbl_df = table.to_dataframe() + samps_to_keep = sorted(list(set(raw_tbl_df.columns).intersection(metadata_model.index))) + filt_tbl_df = raw_tbl_df.loc[:, samps_to_keep] + prev = filt_tbl_df.clip(upper=1).sum(axis=1) + filt_tbl_df = filt_tbl_df.loc[prev[prev >= 5].index, :] + filt_tbl = biom.table.Table( + filt_tbl_df.values, + sample_ids=filt_tbl_df.columns, + observation_ids=filt_tbl_df.index + ) + +We now have a table of 269 features by 46 samples (23 subjects). This is a much more manageable size! + +Model specification +------------------- + +For this custom model we want to specify that ``time_point`` is a fixed effect and ``host_subject_id`` is a random effect. We are keeping this model relatively simple but you can imagine a more complicated model with random slopes, specified covariance structures, etc. Our model can thus be written as follows: + +.. math:: + + y_{ij} &\sim \textrm{NB}(\mu_{ij},\phi_j) + + \mu_{ij} &= n_i p_{ij} + + \textrm{alr}^{-1}(p_i) &= x_i \beta + z_i u + +Where :math:`z_i` represents the mapping of sample :math:`i` to subject and :math:`u` represents the subject coefficient vector. + +We also specify the following priors: + +.. math:: + + \beta_j &\sim \textrm{Normal}(0, B_p), B_p \in \mathbb{R}_{>0} + + \frac{1}{\phi_j} &\sim \textrm{Cauchy}(0, C_s), C_s \in + \mathbb{R}_{>0} + + u_i &\sim \textrm{Normal}(0, u_p), u_p \in \mathbb{R}_{>0} + + +Stan code +--------- + +We will save the below file to ``negative_binomial_re.stan`` so we can import and compile it in BIRDMAn. + + +.. code-block:: stan + + data { + int N; // number of sample IDs + int S; // number of groups (subjects) + int D; // number of dimensions + int p; // number of covariates + real depth[N]; // sequencing depths of microbes + matrix[N, p] x; // covariate matrix + int subj_ids[N]; // mapping of samples to subject IDs + int y[N, D]; // observed microbe abundances + real B_p; // stdev for covariate Beta Normal prior + real phi_s; // scale for dispersion Cauchy prior + real u_p; // stdev for subject intercept Normal prior + } + + parameters { + matrix[p, D-1] beta; + vector[D] reciprocal_phi; + vector[S] subj_int; + } + + transformed parameters { + matrix[N, D-1] lam; + matrix[N, D] lam_clr; + vector[D] phi; + + for (i in 1:D){ + phi[i] = 1. / reciprocal_phi[i]; + } + + lam = x*beta; // N x D-1 + for (n in 1:N){ + lam[n] += subj_int[subj_ids[n]]; + } + lam_clr = append_col(to_vector(rep_array(0, N)), lam); + } + + model { + // setting priors ... + for (i in 1:D){ + reciprocal_phi[i] ~ cauchy(0., phi_s); + } + for (i in 1:D-1){ + for (j in 1:p){ + beta[j, i] ~ normal(0., B_p); // uninformed prior + } + } + for (i in 1:S){ + subj_int[i] ~ normal(0., u_p); + } + + // generating counts + for (n in 1:N){ + for (i in 1:D){ + target += neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]); + } + } + } + + generated quantities { + matrix[N, D] y_predict; + matrix[N, D] log_lik; + + for (n in 1:N){ + for (i in 1:D){ + y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]); + log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]); + } + } + } + +Running BIRDMAn +--------------- + +We will now pass this file along with our table, metadata, and formula into BIRDMAn. Note that we are using the base ``Model`` class for our custom model. + +.. code-block:: python + + import birdman + + nb_lme = birdman.Model( # note that we are instantiating a base Model object + table=filt_tbl, + formula="C(time_point, Treatment('pre-deworm'))", + metadata=metadata_model.loc[samps_to_keep], + model_path="negative_binomial_re.stan", + num_iter=500, + chains=4, + seed=42 + ) + +We then want to update our data dictionary with the new parameters. + +By default BIRDMAn computes and includes: + +* ``y``: table data +* ``x``: covariate design matrix +* ``N``: number of samples +* ``D``: number of features +* ``p``: number of covariates (including Intercept) + +We want to add the necessary variables to be passed to Stan: + +* ``S``: total number of groups (subjects) +* ``subj_ids``: mapping of samples to subject +* ``B_p``: stdev prior for normally distributed covariate-feature coefficients +* ``phi_s``: scale prior for half-Cauchy distributed overdispersion coefficients +* ``depth``: log sampling depths of samples +* ``u_p``: stdev prior for normally distributed subject intercept shifts + +We want to provide ``subj_ids`` with a mapping of which sample corresponds to which subject. Stan does not understand strings so we encode each unique subject as an integer (starting at 1 because Stan 1-indexes arrays). + +.. code-block:: python + + import numpy as np + + group_var_series = metadata_model.loc[samps_to_keep]["host_subject_id"] + samp_subj_map = group_var_series.astype("category").cat.codes + 1 + groups = np.sort(group_var_series.unique()) + +Now we can add all the necessary parameters to BIRDMAn with the ``add_parameters`` function. + +.. code-block:: python + + param_dict = { + "S": len(groups), + "subj_ids": samp_subj_map.values, + "depth": np.log(filt_tbl.sum(axis="sample")), + "B_p": 3.0, + "phi_s": 3.0, + "u_p": 1.0 + } + nb_lme.add_parameters(param_dict) + +Finally, we compile and fit the model. + +.. note:: + + Fitting this model took approximately 6 minutes on my laptop. + +.. code-block:: python + + nb_lme.compile_model() + nb_lme.fit_model() + +Converting to ``InferenceData`` +------------------------------- + +With a custom model there is a bit more legwork involved in converting to the ``arviz.InferenceData`` data structure. We will step through each of the parameters in this example. + +* ``params``: List of parameters you want to include in the posterior draws (must match Stan code). +* ``dims``: Dictionary of dimensions of each parameter to include. Note that we also include the names of the variables for log likelihood and posterior predictive values, ``log_lik`` and ``y_predict`` respectively. +* ``coords``: Mapping of dimensions in ``dims`` to their indices. We internally save ``feature_names``, ``sample_names``, and ``colnames`` (names of covariates in design matrix). +* ``alr_params``: List of parameters which come out of Stan in ALR coordinates. These will be converted into centered CLR coordinates before being returned. +* ``posterior_predictive``: Name of variable holding posterior predictive values (optional). +* ``log_likelihood``: Name of variable holding log likelihood values (optional). +* ``include_observed_data``: Whether to include the original feature table as a group. This is useful for certain diagnostics. + +.. code-block:: python + + inference = nb_lme.to_inference_object( + params=["beta", "phi", "subj_int"], + dims={ + "beta": ["covariate", "feature"], + "phi": ["feature"], + "subj_int": ["subject"], + "log_lik": ["tbl_sample", "feature"], + "y_predict": ["tbl_sample", "feature"] + }, + coords={ + "feature": nb_lme.feature_names, + "covariate": nb_lme.colnames, + "subject": groups, + "tbl_sample": nb_lme.sample_names + }, + alr_params=["beta"], + posterior_predictive="y_predict", + log_likelihood="log_lik", + include_observed_data=True + ) + +With this you can use the rest of the BIRDMAn suite as usual or directly interact with the ``arviz`` library! diff --git a/docs/index.rst b/docs/index.rst index f0247fb..b6b294a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -46,10 +46,13 @@ In Python: .. toctree:: :maxdepth: 2 - :caption: Tutorial + :caption: User Guide default_model_example - user_defined_model + custom_model + diagnosting_model + working_with_stan_code + working_with_arviz .. toctree:: :maxdepth: 2 diff --git a/docs/models.rst b/docs/models.rst index 9b3977f..15d9841 100644 --- a/docs/models.rst +++ b/docs/models.rst @@ -14,5 +14,8 @@ Default Models -------------- .. autoclass:: birdman.default_models.NegativeBinomial + :members: .. autoclass:: birdman.default_models.NegativeBinomialLME + :members: .. autoclass:: birdman.default_models.Multinomial + :members: diff --git a/docs/user_defined_model.rst b/docs/user_defined_model.rst deleted file mode 100644 index 1537045..0000000 --- a/docs/user_defined_model.rst +++ /dev/null @@ -1,2 +0,0 @@ -Defining your own model -======================= From 0f6184d5325fd24958ed5b86e296d2f56582ec07 Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Fri, 2 Apr 2021 09:06:48 -0700 Subject: [PATCH 5/7] add diagnosing models docs page --- docs/diagnosing_model.rst | 59 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/diagnosing_model.rst diff --git a/docs/diagnosing_model.rst b/docs/diagnosing_model.rst new file mode 100644 index 0000000..6efcea0 --- /dev/null +++ b/docs/diagnosing_model.rst @@ -0,0 +1,59 @@ +Diagnosing BIRDMAn models +========================= + +One of the most important things to do when running BIRDMAn is to diagnose your model to make sure that it is useful for your downstream analysis. There are a lot of ways to do this and a ton of literature/tools about diagnosing Bayesian models. + +BIRDMAn wraps many function present in the ``arviz`` package. We recommend reading through their documentation for more information. You can use the ``to_inference_object`` method in any BIRDMAn object to convert your model into a format that is compatible with ``arviz``. We wrap a few of these functions for common diagnostics. + +Rhat +---- + +The ``Rhat`` (:math:`\hat{R}`) statistic is a measure of chain convergence in MCMC methods. In a nutshell this metric describes the concordance between within-chain and between-chain parameter estimates. If these values are very similar, ``Rhat`` will be close to 1. You should check the ``Rhat`` values of your model to ensure convergence. + +.. code-block:: python + + from birdman.diagnostics import rhat + rhat(inference) + +PSIS-LOO-CV +----------- + +Pareto-smoothed importance sampling leave-one-out cross-validation (PSIS-LOO-CV, referred to informally as LOO) estimates the (log) pointwise predictive density. This can be thought of as an estimation of traditional leave-one-out cross validation, which is often very computationally expensive for Bayesian models. To calculate this metric, one must generate posterior predictive values in the Stan code and specify this variable when calling ``to_inference_object``. + +The expected log pointwise predictive density ``elpd`` is the sum of the posterior draws log likelihood values. The objective is to maximize this value (closer to 0). However, it is important to note that this is more useful as a relative measure. All other things being equal, a model with a higher ``elpd`` better predicts feature counts. + +An important note here is the ability to calculate the *pointwise* predictive accuracy (at each data point). If you provide the ``pointwise=True`` option, you will also get the Pareto k diagnostic values. This metric assesses how well the PSIS-esimation converges. These values should be mostly less than 0.7. If you have many values above 0.7, it may indicate problems with the underlying model. + +.. code-block:: python + + from birdman.diagnostics import loo + loo(inference, pointwise=True) + +Posterior predictive check +-------------------------- + +Posterior predictive checking (PPC) is the process of using the simulated posterior draws to predict the original values. If the model is fit well, you should be able to predict the counts in your table to a reasonable degree of accuracy. Note that for sequencing count data it is very difficult to predict values with small credible intervals. We recommend checking that the general shape of the predicted values matches the shape of the actual values. + +We provide a function to plot posterior predictive checks if they have been calculated and included in the ``InferenceData`` object. This function returns a ``matplotlib`` figure where the black line represents the original values, the dark gray points represent the median predicted values across all chains and draws, and the light gray vertical lines represent the 95% credible interval of the value. + +.. code-block:: python + + from birdman.visualization import plot_posterior_predictive_checks + plot_posterior_predictive_checks(inference) + +.. image:: imgs/ppc_plot.png + + +Comparing to null model +----------------------- + +One way to determine whether or not your model is learning anything biologically useful is to compare to a "null" model (with no covariates). If you find that your model does not have more predictive power than random chance, it may be an indication that you should rethink your model construction. + +This can be done fairly easily with ``arviz.compare``. This function takes as input a dictionary where the ``key: value`` pairs are the model name and the associated inference object respectively. The result will be a table of the models ranked in *decreasing* order of "goodness". Note that models must include ``log_likelihood`` groups for use in ``arviz.compare``. + +.. code-block:: python + + import arviz as az + # model_1 = InferenceData from model with covariates included + # model_0 = InferenceData from model with formula = "1" + az.compare({"null": model_0, "regression": model_1}) From e323935dd5e713d38368273ec2392d95e360ab99 Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Fri, 2 Apr 2021 18:56:52 -0700 Subject: [PATCH 6/7] arviz + stan docs --- docs/imgs/ppc_plot.png | Bin 0 -> 22045 bytes docs/index.rst | 10 ++- docs/working_with_arviz.rst | 128 ++++++++++++++++++++++++++++++++++++ docs/writing_stan_code.rst | 50 ++++++++++++++ 4 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 docs/imgs/ppc_plot.png create mode 100644 docs/working_with_arviz.rst create mode 100644 docs/writing_stan_code.rst diff --git a/docs/imgs/ppc_plot.png b/docs/imgs/ppc_plot.png new file mode 100644 index 0000000000000000000000000000000000000000..e8c74107549e1b465318ef2b4909e773dcca0f51 GIT binary patch literal 22045 zcmc({d0fu>zbuN2TG}{RnxEb3WNK$` zZezViaF3uU|JDl*4z~7ELPA#m`hcK~-FYD)pY>~SkyWCbC^X$Cl=I1S<*ImDUUCN>7E_eLl8|$Jyc=QK7O0>VX zvEiUcW)$e8J=D*-sT5PkfmJbAtvAt33>C+;3G>RMe`<&Yx?uZb*4_ z;K0C^Y|%Y?m|0lNx^G*^Q{SB!$^^r+T*L;Rn>N5}D;iGYIRPV*Y4Aw2W*P2zMqworS6!NEMaIR?ubWbi=l=5%6ugb= zoS0L5$iu0vA zdpF|RwdJuAx~t=EEnU*-3!|*%kBh&4Lf?T&!!++TdCh1aV$2%4=e+Ns!-q{X^=QSO zo}33Rz3;x5o0}Ud<(TltAAfx5?@v^ZJlgx^?R|0654E+-oSeyCUKEDq%Xdpj>3yvZ z&*r%w5D;~5yFz(GgYLnD2SdfJLPthM5-gnQC~}-~YtAqawB%~q6pjbwIrN@QpNkGQ z{E(Phl(&e^jd!rU*vQCpK9ieAL`41h^XIO!gW1e% zZ1Hd291}R5bt2qzA$Q#t8Gc2*VvoY#e*2Al#j(F$8CR&An;y-yY!o#$GpnDwj|FsQ z*E%-JF4f0LouxM{I!dmHSNLs_O{uHXTFZ6dqki|`V6w;jjCp^3vVYXn)TOSkUz-C~ ztzD}ftMaH)v^mR4Q%=syb-d3ssJAsgd)cyOmgNq3g(pv*n1%d)*n9WBeY)Sjf1hYa z#8TTOad^|S(kgn^4I3g`^PS0nM-M3}D+h8jtX$d5&$Mn`y-@Mo*`V(uBWdQ6{JffD zQ+Nz9^AAyVGhutp&uYeeaau-C-__M+8pNYyZJlCgZ(r4%!Lp3f5!$mXCF;wUbIRc| z28lY!{(gR{xn0L)o_HJUk1GnEib)!2(2R+48SM_CSHeSXWMR=UF2oL_h8|`p!D~H| zv_FbR3chjU#(@I|GF(Q@3MZN@M!tO$KKFj%g!#Yu1x`oii$N? z=vP*X-n)0N{LP#9zMT6p^4T`JxQ}uOrgZm?9k2M)J?33qE?-W(JUP@gbTvCW8^O6g+udY^oB;}ZPsvDc==pFWuiu08Zp5)qpO|WXtqBdtGcfWg-h?z9U z>h_Pqu9BYWxEd;HAHyz2jq+Z*nOdK8s;?zCT{4b)+qOgM>gu7w#(x9}8t^KHN?d$O zw#LL@vy%S`CaxqrAuskLCNu5bqy4d`(#_0YTw8SwfxM%RgO{T7`o5}6{J?;va_C;g z-rn9mT4Hd0s*#YatZaJa9}H{0bap(2sI8x3hmYq(al$h5nJIN3xZk3j6 zg%ET89YV1c z=UJD3Qg3Facr~42^`8D61H_S+xFSx;=f?K)zZ`dD#yjZy8?zoic1-Qip=CygH|-&Ys;F+xi(((&{Hxzx|0{J7qr1m6<$S$grGWBwy4 zYSI6AjO9NjXXB6QWBrjzmmZt4?gmM0XB3v6Drfi7jQ`tEcJx0x_x3l(-VgFf_9K&% zy64VC70vYVje3*{lSOsi+q;~anc%s&Aa&_mM3of6l9*$k76SvrTV$cgqjx^My}!$< zoJ0xi3OYKv)xZ3rhJ^7hT-HW&Cl8O}b_L&n)YMcYxV=_QdlBEXn~UfvUU^pqJ1pK8 zxY!p?eY(|@X%Xx+*c9DtQ*7Yea`6V+h@ormj>4I~M7@Gb&z*)^;_u%5buIheV}MEB zQfAoNezFTEMMOlJ(oAm&>gPnP-+nYMBBF$U4|d34W4Z#ehR>ZlBm`~bK6IN*|TTII&W;Dc2|aoO`gNZYSH^OrO1&-&B&d9u}0{^TmKzCw{B^c`Eq93 zb{>)*t=yB>pOVi_vX_>v?K4D+NMyg0Y;0_L>pVg~ZZ$w%_>g5#`)hP4ZqD!SFAQo8$eV>@nDfXDpaO~Gn)Npev z7#SV4D!=CKouC=3Vitnf$Fp^738SZfUB0sofW*gu-n#hvYzy4xZ|?qHUR9;-=H`|; zv=3|6!?%TDz*fM^Kf`m;L;mPd-d(#C@O%wrTRcg`3+?2WkoXwmhIjXv?%JfNfd>G> z+sMj#GT{XNeJV!r{?6#E<|xGtVWK;-H?Xqq77`LlKXD4nB`_?^qU~K}Wzv8(HgVlA zH!SHa+lva3D=Lph1) zKn4i;cH?iqJzcw5{J|tHWj@+n)mIzq#~qJUG%_}p!E@i=|0&Yc1JiBWw+G(4_k?r5 z(*x5ioA#93w;97kSFYnwJAT~fl-c~u_z31|a;UW;IS~m=^15=(7K}URJKXz8=yqYc}Sm@sHp;trk!^@i+#Lm9_CDW$e z(0}ixZ)Y3_KC0K$)VNM}1v?J4NLGc4DQ(z!Vrw0OY;3nJ-M6vcS|49u9lQWw1yI{2 zEN1Hah=cx7yTaWx8XFr!_nP|ysp4sbl8uTDJv=9djB&&fJ!i+n(xp)8m`(Hdo`jj8p&s3t+Z!bxh$chZe3Y2wMe}v|x z^p>-GDU^2z|0o}p$|JS4KN8^jrFZ%HV09PYy&uuNw)EAN@^tUn{@A*dEMNUdk8jxY zDAJ^aEkBRkcknE7-^{hM6fXgVg~}L)Uv1BL8{@GKu3EK9<@>GQC>(k}{^kE^`! zP&BW9rkVdJwQJGqk&*oQq)soll6TqE^~Rt?e+{e3O^YwfC>%(!a$=qAxj+oEOBmrt zzWR7?&)A9eOY?ALdw96Yk1Xr7#w+=BcD+sUybg7Gdh{y|xbd%&LE5bVC0?4g z-Yg}&D@({C`Vbhf-?~L2Z1#Y(G%d|k2_d&QAq=l#%j=lOfHG9gzb-+WZxCe^>MQM( zjkr&OU#lR4^W4LAaRs%17t+sYRoav0A;p3UH zUXxTBeiIS;QGg;d_WrP5y4jsVos}5r0R4nD5(vqpxGP$iu^{>S@#Dwhlf}r96HOZ_ z64Ir@l|rj-y?WtNE64fe&9Fs2Ly7zBL~l%(wLdqfgl$Q;jDp89N`yG8_Y;(?iWpeqFl+Vwp?&RcCaKDJHVOXV zzGZ?^i>b#vJc>vNN9?=Ml5=sQ>Bm?Dj^oA*Yghtu-QGQvB!y+6IDN!AV{U#*oi9~( z)zS*zct5C$!o`K1j$YWv6oJZu~jB%{7s+^!sd ziSWG`FW7OljSaAi>gjCjK!+XIy8ol99vCRzdAJQ=^%+lXyymVwbBZ7C{(d+hEG!WPlxTk# z{!v+ZB1{&GQQ>ShMFN-?tU_PMUst7^haMv7Id+uL;Wt#|>4r>|QsJ?w&wl;R#j_|w z`?wX4|6Tb9tX;QGe3%dc!NI{O@cH;_qZIFJG+Ta%x~UxE-@J;EF#vx#wN$`2K3v1b z!s3*izZTo?fl?;rH4-ktqJRD=989t`Xu&+7^Td9=uK_%o88+=Qg^LTb(L;PLXQPc- zb)_5A&V{6v8>xnTGOK=^3{M_8E@O z#Hc(<01>Fw?lGGjy3fX1kZIMbdr=w!w&&kH6gk}wNQEbi9UB{4UGY%HJ+IEG_QMCu z@_-f+6!1ua24(6AbuTP7H(`LD8z_pk*Umu>p>*R6>( z-dnGpvh6XuFw~MuNQ9;=t3<*n7#6ycO?YbJzlywcW3I4a2LtpADY759^i0ZWAg-t9 zLgCy<1zAmIRiV+VHtc+#hEUJ@1!)xd(Ae0xsqk{PxaG$P5Sed<88Q7ejQW9@sMZuU z78mBNuw9GvQ7{B^OHY5fU;gf0Jr-5~fGr-+9C)*v18DxtU&~xBUAi#VQ$s$03CKA6 zYPJ13IYqty4&l&{p^AOr8&yvCEzG8<21{?_rcJv%Jv~X-y1HVmzk})O;LSD#>NZ8w zF&>Mv?YdO=Ky&fTll+A8MEugUX4|r5i-2B6+>2FVo5d|J(dys+WF&NxH+bb$3p0-H zDL?yMVl4GCC9{kz&nwYwT?fKX#o%vifBEHoDU)oHu915tlC_r}U=6ATaIYO3w|t{e zgSRgpOR}T1V9uXDeOg}Vi5SqO5=!z>sBRbK67s5OtS8T>&QA4czAJbSiAbr|WgnAH z?E%qP+%h{k*c=P$MaOXu{`TxXpn`F!cWek6$Zd#VqZ_~j8CyOw|zI|?Ic z!20CV<2{jnRAf(aO>Rau(IYkm-;d@7E2BD!v(SIapym(q<_QQU4e&>(3&PK_dXuGB zQX+Rbi3?BQbS|6k$}tTvtVnU@)!Kw@c$aX1@CTH|GE)L0GyRlkbiS|dwO>WqFUR>; zbRR{FWeFwvKI~k{`tLZ2|1RiVE{FU4hpfdi#!@w7<0n<3OZc!)7*P=t1IJ(Y4Xjf0iuHl}ZW!@D$t|q&W7Kk49B-0V1h1g)Q#=-P0NytfzcZmtC%K1rrCE?rdmrS-HL6}6>x z@V>Pt-r8l}B(tE#(An8(>Ye{dzzgisY9^-Cw^Z#vfBePT+M0yIrR#%m1yS(uW0!;& zX2({8Q_4YYxUCSBr&XTU{QSI_>(~YPBS-4a4jy09g|zM&vx2e>@m|N709IGgKhJ3} z0X)=x{i*vL!W56mR%doeyC|ZyJU==1c31ev78_P?#G%Od@6}Ev>x>V#8$l34^^P zClZ$W{}h!)%N|7>=w()((zj!0RrqAP;UE*IlopnQz^F{AaK`^{MAjv7ESGeA&czn$Mb=nlu8!`fdJa5XKKV4z!P$&3Q939j8fbClOT3A|lVT5_RhEE4i;Q)g_7|I;_9I@3_qA)sG~!uu zNcouT`1|j_mseD1y1KevH&ubeL!CD_Hy1ZOZoLbu5MvUpx8hcq^yMrdoyy`bU%pth zC`Gt34^PBbF{x1Ed`4dATJab%o$p!ZltB3iH1)UOl2%Y~z^bj_l>5xsJ#NS0HUm;H znxc?$0CG(~0iG9E%rGfimtF~R1KgFdW_QUI`s0dOk8={NtQ}_ zt=mL#CUmlOepSIdaJiEI>pPocnfY{j5DT=NoU%~nM2(F(jQ7!$pl)a9{xbR>N;8!= z*>;%YLqutn)Vov{r!H>Kl$W31e@m$7JyuX%wv7?O$sM*mM^g(%jyuXSU22KeHQX~R zS3q(?(#!WWfh)VT>e!DhrO)+0vYe)x+SaAe+ac#vu-|*z{4Qi{3I^~3_ zYIexhrh7|7Q$obcvmumE_fxQC*qcbfULX-V`{~)iX?^Bp>y0-K^R@)9mlGswN$pS8>pRlw{I_$WT z&~{!*lDLjRgY(3|Dbw@km1c_Rwi+55ibs0w*|R4wAmCiXpbo>2ECke6xv1;FmXnl} z#GY77TYA`X^}Vr1@;=VvIvXkTb8`c_&w00zlpYW%N+*oyual^)F-`Zrz1;{IYa+G7 zap|vGS6-b}Z%8rNF>gfSm&|q0(h4}N%sBWnOMx%dM3(C6?d9oR)T8{&Q#Zm^-n|)t zv@j3>ZIEyPQG8z+oIuzRLxQBjd-WxS=gdlo%pVZsxD|tijOvTrXU9-0?Qh|J|J%}Q zl>4MC*Vwz`g!-m$*Vk;y`3=J)XJEqW?KIpb5I?7067D|1rb`G;+ZBPUtPn8}L(+MS zL@8$H&-a7qbxLg?-$r?1{R5F9>GO%%{00m^6dUf5M-2fFKDyYYOut<*-)S(-=v4x; za=r-TVX%S_#fX4MRaU=x#44`m_*U7)x%5&8dNK`ikATkBA*+})TeRdnpI-D?b_S#z zXc20XAv0_mv}=MFDJcFDb}B!r@1dPBSv?g*O8{yLp7FKHKMGNY3W4l6ln+!|LSSje9P`DbCfri%ps7M5i$Qur zn@@c|C?D(v++7mj27CEsfEZ<=dfa1$METcao}I|xpv)**ZmrQG=}3d`DMZ_CR+K#AiN zT>jhAP<3{9$H3;Gbuz}HA;r`%anhB|`$^&D$>SQikzOVyCY;@(wo8k(P6XplvNYzR z$_^Yp=BL@6s|0?5P|bH=zJ66o<7D6`JA0iyo5W4^67W0g2!2Ddcf^M7`tpSg0$eNp z2B#N%Hf~YVs5#)}YlqKL{*(ffKKgLOtydZXHntQw8aNAH0Rd%6Nl7qxG%0zXS7$|H zU7-ZL5;OXgoaV!{JAOAss@$}Ld)d~Mdyy-WpWgS|5f4McI!&RH5az;m&fbC@64|KI z+{bCUW=bC+Z+*eU#l^+xg>&cfrjABLY(-E~2hCD8NQHn6zj;`#cT<=U;9^=MYs< zqPSeHD3hz}ZPF~_*LP7Wo*i=()>iNg6E zgKSXZ$leN>PT?+Ci^|H%)fMN!Ma>Qu>q7ja#_3EnpqfqZ&&KWY!1$811TzQCt_pCu z-uPAZEqY2sR^3EbBUPhD=Gu)Vif|om@H7YAuU>bavY05af1&^SIgb9d+J5Tjh^3f- zkdUhX=ixO=f4(2VrhF&i%sCmKHB3yw#U!tl&@Q@Ug(4J4wx3a76Ooz6YIt-4=X4k>}rbZ1Rg2hwL|g1BU`pO`{@{FWkiG1jJye9dKPlD)-S7lybFhf!trrN!FB&qtlVpK^je{qm%Bt5dU;m_g?8Ck(&RQc|kd zmvO(Ls6kb*rXrYOJJ#h~G>Ns^C~H}`E-l`;l%h6u)yuDWmtB*dkx z)ThXq?PG~4C zPoS!HDMEX@QZ)itM+&+phqgu{z}RZ+*wvENAZ0P?BL{%+=HB+`AX;6t@~!HA-IoyV zu_R!;;T>%$-UbCUh`5o0T|BYsz*Y?Z8hj&ho*h}5UDwbeG;*zFz8>emsWSi3nZKi> z$NYH1>Mh}q9}6l{0e?a!0gQQhc;3ncCT~MPEm^g3(ck`>kf}tmf3l@-LPSKw3!P(^ z3%8`y7xMKcT$S{g>D{HsY&0vu$JZa)ufvG|*0KB-uQ-9gkji4I>MFj~+O*^(m-*wo zGBx&!n*{ZR7u%m1m)ccmZ(L`A;Nn%vx4>{@Dn4K0op1aX7jILBl5+i5kv&sE%q)X! zTPvE6z7;)R{Wvq1T~S3vB{1u>o0o!%K?&j;0-bh~Z{e>L1GTqC_BkGb(-8-0%!|wV z5V-{!R?x10*3Z`7dp9znwMM?(LO%R0=y$>N%Et8bh}Hzm^atS=-c-QT5p z+#lgd!#V`yn&eB$uQw?@{s|^^4-$c;dQ*&6MNCMJh<&h_RTvz%T%<-`t6*7+@!2>bme}?wq|Syj_r#JlMRJyFQbx`6SMmw4hZ=j+ zvZa=oNN$kcTKigT4~Lk7rSBF__WZ53q(l)BF!a8|V;Ga_J)bY@x|qQurmt@BkUv+H z!^$%;Q4rn?W)*Y$%zK5}APq%8QXlt+j-YM*X1{HR*I{>Em~b}lFnZ|w@$5ZhL$C?u z9LI}M2%y;LeZAR63kotd*U2|EB0`P;i??s@(Wu%e{yF=UUTANe{}Vt@9fk(@DV1(e zIc*&f%mNqO5_;>!6(p-?FuvMtM$qy5znd!WG%yI?qu|FSAQl%1bK}flcDn-zxkTTA zWFPSB?sMPE{G+<7!V*zUGY2XEBpH%r8UPG8-&n_~f&VH;n>dm!w!sXD#Vn9{0Q&gF zix)}0DSg;*a&Ao<7~f>Q%(Ps&OIBMC?DtOdy$UWGT;Rs_>o2zUJ4UOS``Kz5^nGxt zC(Fk-O@N*GblXK<3+|zDj_Slm?YaiQN8@1fdcio7P%urUdC~Cao)fS zVyEgLuoa~=9CN&09;!80Xr8H;nm-;Nqt!L{&VRP)sr(w$oCc!{L30NZD>rWM$_&!p z`6shGK$b(?2f7r@0UWaMw&?5VTo@f0(SS*^4$&T%sTZO@$(fMVV^QB!oPACli83-W zr2O(&m_GkV#=WUvPEUEJE7)k`=FN!#0ULAk^DRIIBU0w}edaJ-CwG%_GM0Jq%r!+1ySF5tIKMedhF9C`8_tGSm#>eo zoGwOlwZo&*m((Wqob}7+6_8#m;5@xpb(&syRIkv{h)m`b9jcJL7 z0SN3`@MIJT8stR*??WEr#U4U7PUFE)JcXUsajZP5#iA}Hjc^sicq%_=^)wJj&pN6LN zg4lD;iB}@on!yv%FgQlvq zX}TMP0+3G@A%4j5+9+dOW~oKzSt9%^ZPvZ8D~h0hikmJALoQf4l&NP&H-aI3_Wk7XCnCpYy$lrgmGUon6g z7>ZMBHa6n0-!GOc^eWpibFn$y(zITCqrQ(B=WfIHPuUf-dVoA3kghalsX_ zwW=AQTU^scOB-T=WK`SP=RMF9in<-Ybq+je&V6hpF~VU^@#5o1p4Ht((bjmvN^?s~{l3-BgZ9$y}+A@xu3;`}gJEV9za5cv{d6hh2}z_uF>is9l#kC0$uane-@hJ$PG%18QG zK4zg|PkZYSr@jmh3cxe~(dw7HgshJV!#4$kD}YzaXI z-KvC94O_52V%lOx64rY`0Lf+k!~go9_b&Wwap=^rr6u(o;=_Dp>Tdv$*4^0U(_a{| z866*}oE1s6NX^}e$|(hPnueTQiI}Xf1)Xj>AT?20A;jK)b@ut}4EPWLgc!uK6fyl9 zw)*{CzUHkjFD74p*!2LA7;*rybXE;ERrf?PJPSqgiHCKZR>TQ8?}o>dPc#4gO6LPZ z)e58aOP?fO*yZCC8(O%@V+f1@^ z??1vIf3aG6l2tiOTns6oN)#*o&l==%6D;D_P`d9B!m_6+Gx6f|h-vx>uuV0f!cCCj zUL&mD0aL+7RLJ;N2cE;;j3N+#t)YCEd`v&@C*n3`yZyj{mK+#vPI-UsbCa1L)BFsY zQkd9YIPkicpd_w^=KmQSueY)7iASv{d~XK{oMtEQPu@rHYYzJgy7!8$b`=7zxBDG9 zo1TP9_BCRqwvyN(m*Xo+u-3M7bHBU<$6l2wver?kBi|w7vwIt}qCsUJY*lw2JYY?) z1g*fhZ{NP{5SGx-Lk5hS4VZ1olFH_P8<*7{HS7d_@#Z$cSmSy0`~9JBeg;15ejt^L zRT>Gq<7-&+Z=*vagzoNVR|bO;6}U=O(Q-kAqpFd=uCBDe*ko$)Y1M!KToxD_O0}Ou zw4$XN$&OCFDl0?ZLVL`K3m4*G$v79(m}M14e2g$Frk?8bU8Zc;9LH7Eo9eO1yJp+yDP>j?N>8nOM zI?f=h*J~uJosut$OZq(dzY`l)rCz0)v3hfP&Rv`zaxO+W`-uCZ{-ef)`B~E7^6()C zp517R=iGC!SQk8Q+Mjr&iiHQxFZt1a=b${l$v)=ga=QRuEUhD&p&IZ7r-5x(#t6U`C$43{YX4A~RPp>>2;wOS=F}VYrfZp|37}2IT)Z zrY-@Iq}$Fn5jb^31FZ>lkqS((a;Spbp)Jl;FJjec!KIAk5=o35NMc-ETulWo>4ujF z#Yo7l&Q08+yDVY@`TqZsO5%0mlHl{?!3yl2`f0}P{@&kKn2&V_3wl&k52nd11i;4h z+_?)tPG%t(0>16ruQnVMLzA6m%n5*e_b9j&jxH4p;h*)K;fHH!(f5tNPpqmF!|kck zmnR;05?}Gh=O&qGgaXQ$+bD3^5nR2IrA3?uPHud$SEs9{90Uut*3|hKS<~lOxH`!R8JNa07(a#TD*dN?}amygCI4DFKMWsq2mD( zU56c9QkBJCUKF^1N3&vMW4T?81AwJM3Z~ zj21>VX*-yYqY_YlD4~v)p?j!Yt&8WK23zwd6CABT_$a#{K6L2I=g->}^`4)-j=M(? zt1I|2c%5{SBG_!dLe~!D)9sU8oc3#U%`vat`AFj8A<)?VjDshCU@o@H23w zTBXkuyC4>Cd3kwWP%Nw0RYFCGAf7|-%a>W4vnPeehFvOJ=;MHu-CEPSt2_|>V((E5 zjyGAPn*a3&18K^u(2q4{WtVXydhR%o78F@SvanV#vPPh6(vB|*7Bu+4ms;?2vhA{J zyZhuL;-9pr=U4oYbSix>MEyM*W1^zs;PmP?S)BVSYXLJI%nWs&^9`Q8ex7rYB*mpK zVAT4VJg2TAlO|}XQr5kO+c?W@+77!S;)N)Z=N_|n56#<#8dAeSL>r?4F0^6+m)ne= zx(8VV^jO!h@FNciWA$B_nw{N@R~7Fs?DXDz8?A^z`38#Ketv$P{rwiB&LB+Ehx~8D zRQ%Sh5=4;o@|7#YvdivlKHmcWn0UW5aF}W@EVVEK305vP0Q-U(y6Tb-<^+}LiRg(;@GRol zHo09aq!ye3i)ti2UZ?3gTsXOri!0@{O<|@+KYhmmaBymsq*D^+Fw&k$QVTkCNLvj= z@_xhfu<$_fOaM*^%p0cCOJQx-!69WgEF@`Limh}>@a)AMhm*NUO-CAZ9=rDOko_^) z?pdt-oo?)LBW)|L4>&=5VKM6lbwo>kH_j# zBleQyAyZODK%9uKXumvM=sNOt7g=(>FkT{m>5J8`le02Li`Yh(Z#Tm)cn~g6Z73aF z9#h6h#6i4hPD~<&1?aqbFyyJEO#qCOX^K4t&j%n65j-ASs23s`u^udp1Q%a`#>ZU$ z6=MS5X6AH0+=9`Qv9OSXBn?LtNJ6CLMHA$Ukt>yY(NT7Qtz`M9~GDsKi1=MT$3Oz00fZCwMOA?cpNW`ADksK?LZ4%C>3!e_jM zo{2~xGoQCC#$o57l+1ZC)8e@}Pt>KV_NVflqz@L^ zx)-$e&N(_qR^rD}gioa~D$F{MSSLcIFFRT3u9Oq|(fs!6M_<}Md$<2xBlrJ-9}b{t z!ImE+HeA%$3vq)9D1#!A+hKb)Y^KFQ;(h>N5P*&wrC>ptlQ!%b7ZK)=Np<0Ow8-bB zy@Mv9-W(`5$OnHD7975rLa4Ed3MH0X(OHQM=;j6x0INwT44CRpe?7g-obFl_U$Q4y zOwfRuAU2oU^6PT&MWUpZS68Qw(%n6{^;u{4YbDZ;s~`XMmd#~kE11fzRa}6lltobg zWHaqqSYDRwvv5qk1uWd%gZ`<^Fkq?5Oh>hph5rc2Xs3?Pl9IZWD6JbO1BB#{DgzA< zI~4hJlGNa&q-Lr(uA=+Z@sOQe8lg={t%ipOMfILcB^u;tPBgVOu!WJP^2`f0hd^(^ z;ZXyDz^a_2G?rVhx>I3o7B?j@lbEBygri;id9DovRrr_AMfcr@#WvW`RS_#r8}AF$ z8wL14ogV2TFWdA^=LIe&cJb3oYP-tC9mNHFCUaBY^G4t82NB?S3gJQJp~Mpy?#b}* z^7AK6Sg{n|K*qbLF-h89F()vBWH(O?<$mSjl(c(k-EZ&6xa>;pEShr&&_%CbFX}#B z6x9yjIu1&If4KpODHTd&ED(SFr9zbX@aRgUrm-AAjFAFujkW=43XxPs2bqT2nZ&9d`&qQKWq2I$KeCA9VLHWd}SGPs;c&0Ko-INi!2jvLk|xk(x53TFeJoW za1_mwxI63)369OM>ynmuxJb@r>g`ZOf^l7(pCr7F5XtLFpFdeGq$vOiI#SZUn-uVb z#8W;6CIA%41J75UySaDnBz>ZU&RH0Xm3<2{1+flr_ri2j2e%CGI72o}Xr$1`EJR`~ zl1Qu}oj>Hv0ry!a!UTT^v~K3njOhWI)`gh2Vl6w|^v909hblzI1o{b6}bClYurjBZJn&+3^6UC7gFUoE|wkx`4Rnleu39xU?iyjyzfzE*|BV)&tK zsvsW-(w_1u0*cq*)4O_)SM=a)1+tNV0Wr?1`Od>BV=|I|KxZau-Of2h4KuTt`V@o2 zmoHyZTNkSKwG(=h*xl~ug%B;p zU_Vw9kuk-~PV%-gxB;#ZU;FBH>vZr|wDj|;?ojN+{mc=RNYDF9u1j)&=CEEiApe^d z?<%>up3+%|?H!98h|>oYqdnN*N4qfa@%aogiPqlME+*mINwofB z!=8($fu!Dlh*okO>rr<{ZYE9hxkY53AsbyqV=d`PXvGwh#zO?)Dp3Lf2^ESyj(#NZ zJU~HO#6fyekeY>nz$FR^08St`IragxMvZ}UYc%OxB8)VUJZTmf6(NnWD7D5gY&{Ur zvfD0wUP}%xSWvydspha_Gb|EeT?mDzvn-#V$^rx?T|bC-wP=2{d|aCXD<*P0jyKRn z{R!%tM$kiJnmnc{Yx<(vJy3~D=R+8I%aeoyDA;D0YZs0462{;?ztH<3nlOq`+Z}3^ z_nG5Tq`esDBoJrnsEF!;@7IdvCr%SNi^Lvrt0tT$F_l=~kAOt;T%7c@K%^nBJ{~JO zAK&s!EyPvpHL&McIbUiI#HL!T;#epvqy=*cA)|e=iSatQvPkC{&p_=gq2GShiwJ3xoHKrc)rDO{I2=)y#a%{>EXTL4rUIYq_9PdWz3OgC(w zO^`#^-pBbvc}l?GwO|?$tC@P^V;?8j=0nOQTXqaZXFuRr9hz%M=hO~TG=|A~7P*e~ zJVCLnj?)c3MEc2)(^?wMhaYV+D2Y?^t4b_WmX2g*tm^xnViJqEBQO3v?YhNgA z(dadY1H_-$97YdE6W9b4lL0hK+7V#VB`2ML34k#x9$SpiS^g<=AKVa-9Y?+>BC3MK zzRnCm426XSdOQ51=H}cAzrWtx2k!0Ejmty13KsQAHCTYVd&@Uzqs?WgXx3U$_fRk5 zROS5%G~p&T7+zM!j;rp?&B zAFduz+vmS%d2S8l2xG=mJD&S4pyuin$<|XG37oBF39=prki=5qntekP7W*Zr! z=yJhRX;Cj+(T#Tr(zcMJ=ivA-MT$F&{&%^42r>8vlzRD zS6EmLtFHdf_1k$fRWq?W;8otQyoR3A7Ih{0LCcCiY&n7*lRZKl# zKS;>*k3y+kt@rHA6|&KZ9j-LJ6J=GL8d~AO(rP!;a;)VAJmrlzMn)JOt$@q}&|4Me`l0^uUb_m(czT= z3a0dDfH^C_?NS7gDWobn)y$SwS67?4sHpfV_ZxyNQF=7F4?I#KZsI!8c;1xNyUlaa zJ*=h%4QVwbqq~n*?UQn!$pB1rKxozwIOdy>CJ;z#eaeOmOwRhjCJ|*k4Ca<#2p3P7 zmWeHN3y(Ysh{~Yv$jakGt8f^w@=|-_;*Oy|n=NZ&?m%M%1TE)~zSQ&ncZL-!n%r0ewZ^V&fbxQyS|A?IK$Jle z;wDXAgyO^;k!l2KK_k6%)1pR(UUGENz*4ntTe_BjN3x?q8h(t)k3uIhBm>(J10R5Q za$bb8v_;qrg+Vt<{R$_WoX!=**iBe!UQTk4+|)-JD1M)p)IMJdmYDSMJ(Md%4?OL4k7QI&U-c>C@yGpMzId^b$QB z`N_kPIcd3mg%>bbMGZrw_Eaf91ad7l@Jijxu|_oB&{ko&M%yhp)COFq315^fJ9 ziJAFfPtU4x4(twc0qE2fuz+D!_YLPycZWHm?6=F`w|(0-2dFfF7!?ElGHz4f#FSHw zRRepy9XBGJAwZL~0oCSwZK2+hD-+IrQTjVL1Ps2N?&|PVbl)Vx*72|?x~F}lNDT#J zB52{noHw<#n$S|Zy#nB1C)JhNUG-CN z$vJ>vYitqhKk%x&uGA1r4*G%BsLfm4zFoob3L0i5DaR1n9l_~RQ7w>mK;%fMx2~{)>J4D1UvyQlM1fz z+lw`3jl{r5+M<0qB%d}Ic^WLvG!*Ye|EQww(FcX7J4ppd91%!^mq~?E^Yl7Qm$4w8 z`#^x#5>lVE9w2tipoKFYy>lZ8jvR=dL^|1Ic~$GAN6v{s zo}^*a6Tsjd_XpVn7wB%~2f_!*2~+@zT&_L8#~h9;K|x6TGvpT7*?EIm&2?~nv;ryl zr?{zC0<-}V)B>NPn%a4XLz3`U-NA|%!Dwj$;E=VZsQYfmJft#G&LH1rIt}V!juKz!6riiAza`gx2C6|6U>)fIsK?S-B8=b-B9IU` zC2rT_ONTb^v#!HgT6@8BA6J$h1B0zZwmlhB(!jXiY2cZpO`l$$u@C?xUSO73hT%xo zfq#P?V+9DYRPO-IH((|+aab2Qatzj-f<7szDT$Ew=7AzUU=}RwjgY>{!8hcPHVhYm zo{#$6(U%hgC++HrBl7aJH(FYtM?vUnp)iOy{0+J~0l0qB6AyG!^ob|8mq0@VfE4i8 z$NdRsj(0qPAeci~Hz0uvI4XhJIvr6B=fwDtbLqg0>*DB>C#J*fzfwBKNMj_Db+7kk z>-WfHk(;e^4uU|dv&wFbfr8k0Vo(I+D3DYlh(c29i+L`%jW$q03uNG=xbeYeT~fpW zM~?$^gnXS^A~IW@!$B#>gYUM0>k$^8e2}iZ+GT)X#NE^V%gb%#TZj#$5Sm|@EY^iD zYiuCBx+&ubxdN%?Ku42P?|w4-&&z6ccR`nWAdrIsINW10znygHn&}eGm*`IWhjI>+ ziUkBk9qK$he6p5vBdIaq93&^Ub*G)S&Xq@HJ ztF3$Lc7N|Bg-7@A-;cz8P6Uw=9aD`C-7JXPs7`SDPQ}AvU&Wr(M||#^?8r$ND(|YQ zlJ#O;54M(8qeu7u=|s?vXVfGpCEB?koY86Nzc8#JiPATrD-i8NeNiqKEMWDKC+MFtCC( zlox?=YRA8?X{D!h@FK?RcdzCpzH7>+&+H`_ +* `xarray `_ + +``xarray`` +---------- + +An ``arviz.InferenceData`` is essentially a collection of ``xarray`` objects so we will cover ``xarray`` first. ``xarray`` is a Python package designed for efficient and elegant interaction of multi-dimensional data. In Bayesian analysis we deal primarily with data that includes both chain and draw as dimensions in addition to the original parameter dimensions. For example, in a ``birdman.NegativeBinomial`` model, the :math:`\beta` parameter will be output with 4 dimensions: chain, draw, covariate, and feature. Dealing with all of these dimensions in simple NumPy arrays can get confusing as you try to keep track of which dimension is which. ``xarray`` uses named dimensions and coordinates to make this process much cleaner and intuitive. + +``xarray.Dataset`` +^^^^^^^^^^^^^^^^^^ + +The ``Dataset`` is the primary data structure that you will be interfacing with in ``xarray``. A ``Dataset`` is a collection of data variables (for example parameters from posterior sampling) that can have different dimensionality. As an example, here is a ``Dataset`` holding posterior draws from a ``birdman.NegativeBinomial`` model: + +.. code-block:: + + + Dimensions: (chain: 4, covariate: 2, draw: 100, feature: 143) + Coordinates: + * chain (chain) int64 0 1 2 3 + * draw (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99 + * feature (feature) object '193358' '4465746' ... '212228' '192368' + * covariate (covariate) object 'Intercept' 'diet[T.DIO]' + Data variables: + beta (chain, draw, covariate, feature) float64 7.142 3.549 ... 0.4587 + phi (chain, draw, feature) float64 0.1216 0.3286 ... 0.8205 0.6151 + Attributes: + created_at: 2021-04-01T22:37:03.663497 + arviz_version: 0.11.2 + inference_library: cmdstanpy + inference_library_version: 0.9.68 + +The ``Dimensions`` descriptor shows the names and number of entries in each dimensions. The ``Coordinates`` entry holds the labels for each of the dimensions. In this example we see that the chains are labeled 0-3 and the draws are labeled 0-99. However, the features are labeled with OTU IDs and the covariates are labeled with the entries in the design matrix. The ``Data variables`` entry contains the actual data (in this case parameters) and lists the dimensionality of each. Note that ``beta`` is of dimension chain, draw, covariate, feature while ``phi`` is only of dimension chain, draw, feature. The ``Dataset`` is a powerful data structure because it can hold data variables of varying dimensionality. + +``xarray.DataArray`` +^^^^^^^^^^^^^^^^^^^^ + +A ``Dataset`` is simply a collection of ``DataArray`` objects. Whereas a ``Dataset`` can contain multiple data variables, a ``DataArray`` contains only one. If you want to access the ``beta`` variable from the above ``Dataset``, you simply index it like you would a dictionary. If you have a ``Dataset``, ``ds``, with a data variable ``beta``, you would access it with ``ds["beta"]`` which returns: + +.. code-block:: + + + array([[[[ 7.14216 , ..., -0.03673 ], + [-0.639199, ..., 0.958811]], + + ..., + + [[ 7.071049, ..., -0.374691], + [-0.61982 , ..., 0.5009 ]]], + + + ..., + + + [[[ 7.096262, ..., -0.281968], + [-0.607823, ..., 1.190807]], + + ..., + + [[ 7.185024, ..., 0.038614], + [-0.671318, ..., 0.458722]]]]) + Coordinates: + * chain (chain) int64 0 1 2 3 + * draw (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99 + * feature (feature) object '193358' '4465746' ... '212228' '192368' + * covariate (covariate) object 'Intercept' 'diet[T.DIO]' + +Selecting and indexing data +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Manipulating data in ``xarray`` is a bit more involved than in NumPy. The most important thing to keep in mind is the notion of ``dims`` (dimensions) and ``coords`` (coordinates). Dimensions are the *names* while coordinates are the *labels*. + +You can use the ``.sel`` function to select specific slices of data. To extract all values from just chain 0, you would run: + +.. code-block:: python + + ds["beta"].sel(chain=0) + +You can also index across multiple dimensions - if you wanted only values from chain 2 from the diet covariate you would run: + +.. code-block:: python + + ds["beta"].sel(chain=2, covariate="diet[T.DIO]") + +This also works with multiple values for a given dimension. As an example if you wanted to get all diet posterior samples from just features 193358 and 4465746 you would run: + +.. code-block:: python + + ds["beta"].sel(feature=["193358", "4465746"], covariate="diet[T.DIO]") + +See the `documentation `_ for more on indexing and selecting data. + +``arviz`` +--------- + +An ``arviz.InferenceData`` object is a collection of ``xarray.Datasets`` organized for use in Bayesian model analysis. Each inference comprises several groups such as posterior draws, sample stats, log likelihood values, etc. ``arviz`` organizes these different groups such that they can be used seamlessly for downstream analysis. + +If you run a ``birdman.NegativeBinomial`` model and convert it to an inference object, you can print this object and see the following: + +.. code-block:: bash + + Inference data with groups: + > posterior + > posterior_predictive + > log_likelihood + > sample_stats + > observed_data + +Each group is an ``xarray.Dataset`` that you can interact with as described above. You can access each of these groups with either attribute notation (``inference.posterior``) or index notation (``inference["posterior"]``). + +Saving and loading data +^^^^^^^^^^^^^^^^^^^^^^^ + +It is useful to be able to save the results of BIRDMAn so that they can be analyzed later or distributed to collaborators. The best way to do this is to save the ``InferenceData`` object in the `NetCDF `_ format. This is a compressed format that works very well with multi-dimensional arrays. + +You can save and load fitted models with ``to_netcdf`` and ``from_netcdf``. + +.. code-block:: python + + import arviz as az + inference.to_netcdf("inference.nc") + inference_loaded = az.from_netcdf("inference.nc") diff --git a/docs/writing_stan_code.rst b/docs/writing_stan_code.rst new file mode 100644 index 0000000..086e79c --- /dev/null +++ b/docs/writing_stan_code.rst @@ -0,0 +1,50 @@ +Writing Stan code +================= + +The core of BIRDMAn is running compiled Stan code that defines your model. While we do provide several default options, one of the key strengths of BIRDMAn is its extensibility to custom differential abundance modeling. This page gives an overview of how to write custom Stan code for BIRDMAn modeling. + +See the `Stan documentation `_ for more details. + +Structure of a Stan program +--------------------------- + +Stan programs are made up of several "blocks". The main ones we use in BIRDMAn are: + +* ``data``: This block defines the types and dimensions of the data used in the program. Every variable present in this block must be passed into the Stan program from Python. +* ``parameters``: This block defines the types and dimensions of the parameters you are interested in fitting. +* ``transformed parameters``: This block defines any transformations to perform on the data. For example we use this block to multiply the design matrix by the feature-covariate coefficient matrix. +* ``model``: This block defines the setting of any priors and the fitting of your target distribution (negative binomial, multinomial, etc.). +* ``generated quantities``: This block is for generating any other results such as posterior predictive values and log likelihood values. + +Data +---- + +Every Stan program you use with BIRDMAn passes the following data by default into the data block: + +* ``N``: Number of samples +* ``D``: Number of features +* ``x``: Design matrix +* ``p``: Number of columns in the design matrix (computed from formula) +* ``y``: Feature table values + +Any other parameters must be added to the BIRDMAn model with the ``add_parameters`` method (unless using a default model). + +Parameters +---------- + +In this block you should specify the parameters you are primarily interested in fitting. This includes coefficient matrices, random intercepts, etc. Important to note that if you are fitting feature coefficients you may need to use ALR coordinates. For example, in the ``NegativeBinomial`` default model we specify ``matrix[p, D-1] beta;`` as ALR coordinates remove one feature as reference. + +Transformed parameters +---------------------- + +The ``transformed parameters`` block is used for defining operations on the parameters more advanced than just defining them. In the default models we use it to wrangle :math:`x\beta` into a format that's easier to use in the ``model`` block. + +Model +----- + +This is where the meat of the computation lies. The model block is where you assign priors and compute likelihood. The parameters and data should all come together in this block to sample from the posterior for your regression. + +Generated quantities +-------------------- + +This block is primarily for calculations in addition to HMC sampling. In BIRDMAn default models we perform posterior predictive and log likelihood calculations in this block from the draws generated in the model block. From 4b41bfb46676b6e22c171d0d7320c2cdac45536c Mon Sep 17 00:00:00 2001 From: Gibraan Rahman Date: Sun, 4 Apr 2021 20:52:12 -0700 Subject: [PATCH 7/7] add bayesian inference doc --- docs/bayesian_inference.rst | 48 +++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docs/bayesian_inference.rst diff --git a/docs/bayesian_inference.rst b/docs/bayesian_inference.rst new file mode 100644 index 0000000..051af1b --- /dev/null +++ b/docs/bayesian_inference.rst @@ -0,0 +1,48 @@ +Bayesian inference +================== + +Bayesian inference is a technique to learn about a given population through Bayes' Theorem. Given data, :math:`D`, and parameter(s), :math:`\theta`, Bayes' Theorem applies as follows: + +.. math:: + + P(\theta|D) = \frac{P(D|\theta)P(\theta)}{P(D)} + +The component parts of this formula are usually described with the following terms: + +* :math:`P(\theta)`: *prior* - our "current" knowledge of :math:`\theta` independent of our data +* :math:`P(D|\theta)`: *likelihood* - how likely is the data given our knowledge of the parameters? +* :math:`P(\theta|D)`: *posterior* - the "updated" knowledge of :math:`\theta` after seeing the data +* :math:`P(D)`: *evidence* - the probability of the data over all values of :math:`\theta` + +Our goal is to hone in on values of :math:`\theta` that best fit the data according to some statistical model we specify. Importantly, we do so by providing both the *data generative distribution* and the *prior distribution* of the parameters. + +One of the biggest hurdles to Bayesian inference is that computing :math:`P(\theta|D)` is hard and more often than not unattainable via analytic methods. Instead, we can *sample* from the posterior distribution enough to get a reasonable estimate. We can do this because the posterior distribution is proportional to the product of the likelihood and the prior. Notice how in Bayes' Theorem the evidence is not a function of :math:`\theta` (our value of interest). Computing the evidence is infeasible, especially for 'omics datasets, so it is fortunate that we do not need to do so. + +To address this, scientists often use Markov Chain Monte Carlo (MCMC) methods. In a nutshell, MCMC uses dependent sampling of a distribution to attempt convergence around some parameter estimate. + +Markov Chain Monte Carlo +------------------------ + +MCMC is the harmonious convergence of two techniques: Markov Chain and Monte Carlo sampling. These two individual components have their own independent uses, but putting them together has been a revolutionary development in statistics and modeling. We will briefly describe each part of this procedure. + +Markov Chain +^^^^^^^^^^^^ + +Markov Chains describe stochastic processes of transitions between a set of states. Importantly, the probability of moving to the next state is *only* dependent on the current state (often referred to as "memoryless"). This means that the previous steps in the chain do not matter for the purposes of any particular step. Markov Chains are very useful in predictive modeling, for example when predicting words in a sentence. + +Monte Carlo +^^^^^^^^^^^ + +Monte Carlo sampling is a technique for sampling from a distribution, often in cases where computing the underlying distribution analytically is not possible. This is regularly the case in high-dimensional space where the analytical solution comprises multiple integrals. Instead, we can sample from this distribution to approximate a given value or values. + +For a good explanation of Monte Carlo simulation, see the classic `Buffon's Needle `_ example for a good demonstration of Monte Carlo simulation for estimating :math:`\pi`. + +Sampling from the posterior +--------------------------- + +Combining these two procedures, MCMC uses multiple independent Markov Chains to sample from the posterior distribution. There are many algorithms to perform this sampling but the one we use in BIRDMAn is the No-U-Turn-Sampler (NUTS) that is used in Stan. When sampling, the individual Markov Chains first "warm up" to reach a stationary distribution. This is important as if the chains are not sufficiently warned, the chains may not converge properly and they will not estimate the parameters of interest properly. + +Resources +--------- + +For a good overview of Bayesian inference, see `this `_ article.