diff --git a/README.md b/README.md
index 576e5bd..fcbc5d5 100644
--- a/README.md
+++ b/README.md
@@ -14,11 +14,12 @@ Currently BIRDMAn requires Python 3.7 or higher.
 ```bash
 conda install -c conda-forge dask biom-format patsy xarray
 pip install cmdstanpy
+pip install git+git://github.com/arviz-devs/arviz.git
 git clone git@github.com:gibsramen/BIRDMAn.git
 cd BIRDMAn
 pip install .
 ```
 
-You have to install `cmdstan` through the `cmdstanpy.install_cmdstan` function first.
+You have to install `cmdstan` through the `cmdstanpy.install_cmdstan` function first. See the [CmdStanPy documentation](https://mc-stan.org/cmdstanpy/installation.html#install-cmdstan) for details.
 
 See the [documentation](https://birdman.readthedocs.io/en/latest/?badge=latest) for details on usage.
diff --git a/birdman/visualization.py b/birdman/visualization.py
index 315ab90..8a41684 100644
--- a/birdman/visualization.py
+++ b/birdman/visualization.py
@@ -7,7 +7,7 @@
 def plot_parameter_estimates(
     inference_object: az.InferenceData,
     parameter: str,
-    coord: dict = dict(),
+    coords: dict = dict(),
     num_std: float = 1.0
 ):
     """Plot credible intervals of estimated parameters.
@@ -18,8 +18,8 @@ def plot_parameter_estimates(
     :param parameter: Name of parameter to plot
     :type parameter: str
 
-    :param coord: Coordinates of parameter to plot
-    :type coord: dict, optional
+    :param coords: Coordinates of parameter to plot
+    :type coords: dict, optional
 
     :param num_std: Number of standard deviations to plot as error bars,
         defaults to 1.0
@@ -28,14 +28,14 @@ def plot_parameter_estimates(
     :returns: matplotlib axes figure
     """
     posterior = inference_object.posterior
-    if len(posterior[parameter].coords) > 3 and not coord:
+    if len(posterior[parameter].coords) > 3 and not coords:
         raise ValueError(
             "Must provide coordinates if plotting multi-dimensional parameter"
             " estimates!"
         )
 
-    param_means = posterior[parameter].sel(**coord).mean(["chain", "draw"])
-    param_stds = posterior[parameter].sel(**coord).std(["chain", "draw"])
+    param_means = posterior[parameter].sel(**coords).mean(["chain", "draw"])
+    param_stds = posterior[parameter].sel(**coords).std(["chain", "draw"])
     sort_indices = param_means.argsort().data
     param_means = param_means.data[sort_indices]
     param_stds = param_stds.data[sort_indices]
@@ -45,9 +45,6 @@ def plot_parameter_estimates(
     ax.errorbar(x=x, y=param_means, yerr=param_stds*num_std, zorder=0)
     ax.scatter(x=x, y=param_means, color="black", zorder=2, s=5)
 
-    ax.set_xlabel("Feature")
-    ax.set_ylabel("Differential")
-
     return ax
 
 
diff --git a/docs/bayesian_inference.rst b/docs/bayesian_inference.rst
new file mode 100644
index 0000000..051af1b
--- /dev/null
+++ b/docs/bayesian_inference.rst
@@ -0,0 +1,48 @@
+Bayesian inference
+==================
+
+Bayesian inference is a technique to learn about a given population through Bayes' Theorem. Given data, :math:`D`, and parameter(s), :math:`\theta`, Bayes' Theorem applies as follows:
+
+.. math::
+
+    P(\theta|D) = \frac{P(D|\theta)P(\theta)}{P(D)}
+
+The component parts of this formula are usually described with the following terms:
+
+* :math:`P(\theta)`: *prior* - our "current" knowledge of :math:`\theta` independent of our data
+* :math:`P(D|\theta)`: *likelihood* - how likely is the data given our knowledge of the parameters?
+* :math:`P(\theta|D)`: *posterior* - the "updated" knowledge of :math:`\theta` after seeing the data
+* :math:`P(D)`: *evidence* - the probability of the data over all values of :math:`\theta`
+
+Our goal is to hone in on values of :math:`\theta` that best fit the data according to some statistical model we specify. Importantly, we do so by providing both the *data generative distribution* and the *prior distribution* of the parameters.
+
+One of the biggest hurdles to Bayesian inference is that computing :math:`P(\theta|D)` is hard and more often than not unattainable via analytic methods. Instead, we can *sample* from the posterior distribution enough to get a reasonable estimate. We can do this because the posterior distribution is proportional to the product of the likelihood and the prior. Notice how in Bayes' Theorem the evidence is not a function of :math:`\theta` (our value of interest). Computing the evidence is infeasible, especially for 'omics datasets, so it is fortunate that we do not need to do so.
+
+To address this, scientists often use Markov Chain Monte Carlo (MCMC) methods. In a nutshell, MCMC uses dependent sampling of a distribution to attempt convergence around some parameter estimate.
+
+Markov Chain Monte Carlo
+------------------------
+
+MCMC is the harmonious convergence of two techniques: Markov Chain and Monte Carlo sampling. These two individual components have their own independent uses, but putting them together has been a revolutionary development in statistics and modeling. We will briefly describe each part of this procedure.
+
+Markov Chain
+^^^^^^^^^^^^
+
+Markov Chains describe stochastic processes of transitions between a set of states. Importantly, the probability of moving to the next state is *only* dependent on the current state (often referred to as "memoryless"). This means that the previous steps in the chain do not matter for the purposes of any particular step. Markov Chains are very useful in predictive modeling, for example when predicting words in a sentence.
+
+Monte Carlo
+^^^^^^^^^^^
+
+Monte Carlo sampling is a technique for sampling from a distribution, often in cases where computing the underlying distribution analytically is not possible. This is regularly the case in high-dimensional space where the analytical solution comprises multiple integrals. Instead, we can sample from this distribution to approximate a given value or values.
+
+For a good explanation of Monte Carlo simulation, see the classic `Buffon's Needle <https://simonensemble.github.io/2018-04/buffon>`_ example for a good demonstration of Monte Carlo simulation for estimating :math:`\pi`.
+
+Sampling from the posterior
+---------------------------
+
+Combining these two procedures, MCMC uses multiple independent Markov Chains to sample from the posterior distribution. There are many algorithms to perform this sampling but the one we use in BIRDMAn is the No-U-Turn-Sampler (NUTS) that is used in Stan. When sampling, the individual Markov Chains first "warm up" to reach a stationary distribution. This is important as if the chains are not sufficiently warned, the chains may not converge properly and they will not estimate the parameters of interest properly.
+
+Resources
+---------
+
+For a good overview of Bayesian inference, see `this <https://towardsdatascience.com/bayesian-inference-problem-mcmc-and-variational-inference-25a8aa9bce29>`_ article.
diff --git a/docs/custom_model.rst b/docs/custom_model.rst
new file mode 100644
index 0000000..40978e9
--- /dev/null
+++ b/docs/custom_model.rst
@@ -0,0 +1,315 @@
+Implementing a custom model
+===========================
+
+One of the core features of BIRDMAn is that it is extensible to user-defined differential abundance models. This means that BIRDMAn has been designed from the beginning to support custom analysis. Custom modeling is implemented through user-written Stan files.
+
+Here we will walk through a simple custom model and how to incorporate it into BIRDMAn. We will use data from the study "Linking the effects of helminth infection, diet and the gut microbiota with human whole-blood signatures (repeated measurements)" (Qiita ID: 11913). Samples were taken from subjects before and after a deworming procedure.
+
+This paired design introduces a wrinkle in traditional differential abundance analysis: `pseudoreplication <https://en.wikipedia.org/wiki/Pseudoreplication>`_. It is statistically inadmissible to fail to account for repeated measurements as in the case of longitudinal data. One can account for this as a fixed effect in other tools, but really subject-level variation is likely better modeled as a random effect. For this example we will specify a linear mixed-effects (LME) model where ``subject`` is a random intercept.
+
+Downloading data from Qiita
+-------------------------------
+
+First we will download the BIOM table and metadata from Qiita.
+
+.. code-block:: bash
+
+    wget -O data.zip "https://qiita.ucsd.edu/public_artifact_download/?artifact_id=44773"
+    wget -O metadata.zip "https://qiita.ucsd.edu/public_download/?data=sample_information&study_id=107"
+    unzip data.zip
+    unzip metadata.zip
+
+We then import the data into Python so we can use BIRDMAn.
+
+.. code-block:: python
+
+    import biom
+    import pandas as pd
+
+    table = biom.load_table("BIOM/94270/reference-hit.biom")
+    metadata = pd.read_csv(
+        "templates/11913_20191016-112545.txt",
+        sep="\t",
+        index_col=0
+    )
+
+Processing metadata
+-------------------
+
+We will now determine which subjects in the metadata have samples at both pre and post-deworm timepoints.
+
+.. code-block:: python
+
+    subj_is_paired = (
+        metadata
+        .groupby("host_subject_id")
+        .apply(lambda x: (x["time_point"].values == [1, 2]).all())
+    )
+    paired_subjs = subj_is_paired[subj_is_paired].index
+    paired_samps = metadata[metadata["host_subject_id"].isin(paired_subjs)].index
+    cols_to_keep = ["time_point", "host_subject_id"]
+    metadata_model = metadata.loc[paired_samps, cols_to_keep].dropna()
+    metadata_model["time_point"] = (
+        metadata_model["time_point"].map({1: "pre-deworm", 2: "post-deworm"})
+    )
+    metadata_model["host_subject_id"] = "S" + metadata["host_subject_id"].astype(str)
+    metadata_model.head()
+
+.. list-table::
+    :header-rows: 1
+    :stub-columns: 1
+
+    * - sample-name
+      - time_point
+      - host_subject_id
+    * - 11913.102
+      - pre-deworm
+      - S102
+    * - 11913.102AF
+      - post-deworm
+      - S102
+    * - 11913.1097
+      - pre-deworm
+      - S1097
+    * - 11913.1097AF
+      - post-deworm
+      - S1097
+    * - 11913.119
+      - pre-deworm
+      - S119
+
+Filtering feature table
+-----------------------
+
+This table has nearly 3000 features, most of which are likely lowly prevalent. We will first filter the table to only include the samples we previously described and then filter out features that are present in fewer than 5 samples.
+
+.. code-block:: python
+
+    raw_tbl_df = table.to_dataframe()
+    samps_to_keep = sorted(list(set(raw_tbl_df.columns).intersection(metadata_model.index)))
+    filt_tbl_df = raw_tbl_df.loc[:, samps_to_keep]
+    prev = filt_tbl_df.clip(upper=1).sum(axis=1)
+    filt_tbl_df = filt_tbl_df.loc[prev[prev >= 5].index, :]
+    filt_tbl = biom.table.Table(
+        filt_tbl_df.values,
+        sample_ids=filt_tbl_df.columns,
+        observation_ids=filt_tbl_df.index
+    )
+
+We now have a table of 269 features by 46 samples (23 subjects). This is a much more manageable size!
+
+Model specification
+-------------------
+
+For this custom model we want to specify that ``time_point`` is a fixed effect and ``host_subject_id`` is a random effect. We are keeping this model relatively simple but you can imagine a more complicated model with random slopes, specified covariance structures, etc. Our model can thus be written as follows:
+
+.. math::
+
+    y_{ij} &\sim \textrm{NB}(\mu_{ij},\phi_j)
+
+    \mu_{ij} &= n_i p_{ij}
+
+    \textrm{alr}^{-1}(p_i) &= x_i \beta + z_i u
+
+Where :math:`z_i` represents the mapping of sample :math:`i` to subject and :math:`u` represents the subject coefficient vector.
+
+We also specify the following priors:
+
+.. math::
+
+    \beta_j &\sim \textrm{Normal}(0, B_p), B_p \in \mathbb{R}_{>0}
+
+    \frac{1}{\phi_j} &\sim \textrm{Cauchy}(0, C_s), C_s \in
+        \mathbb{R}_{>0}
+
+    u_i &\sim \textrm{Normal}(0, u_p), u_p \in \mathbb{R}_{>0}
+
+
+Stan code
+---------
+
+We will save the below file to ``negative_binomial_re.stan`` so we can import and compile it in BIRDMAn.
+
+
+.. code-block:: stan
+
+    data {
+        int<lower=0> N;                     // number of sample IDs
+        int<lower=0> S;                     // number of groups (subjects)
+        int<lower=0> D;                     // number of dimensions
+        int<lower=0> p;                     // number of covariates
+        real depth[N];                      // sequencing depths of microbes
+        matrix[N, p] x;                     // covariate matrix
+        int<lower=1, upper=S> subj_ids[N];  // mapping of samples to subject IDs
+        int y[N, D];                        // observed microbe abundances
+        real<lower=0> B_p;                  // stdev for covariate Beta Normal prior
+        real<lower=0> phi_s;                // scale for dispersion Cauchy prior
+        real<lower=0> u_p;                  // stdev for subject intercept Normal prior
+    }
+
+    parameters {
+        matrix[p, D-1] beta;
+        vector<lower=0>[D] reciprocal_phi;
+        vector[S] subj_int;
+    }
+
+    transformed parameters {
+        matrix[N, D-1] lam;
+        matrix[N, D] lam_clr;
+        vector<lower=0>[D] phi;
+
+        for (i in 1:D){
+            phi[i] = 1. / reciprocal_phi[i];
+        }
+
+        lam = x*beta;  // N x D-1
+        for (n in 1:N){
+            lam[n] += subj_int[subj_ids[n]];
+        }
+        lam_clr = append_col(to_vector(rep_array(0, N)), lam);
+    }
+
+    model {
+        // setting priors ...
+        for (i in 1:D){
+            reciprocal_phi[i] ~ cauchy(0., phi_s);
+        }
+        for (i in 1:D-1){
+            for (j in 1:p){
+                beta[j, i] ~ normal(0., B_p); // uninformed prior
+            }
+        }
+        for (i in 1:S){
+            subj_int[i] ~ normal(0., u_p);
+        }
+
+        // generating counts
+        for (n in 1:N){
+            for (i in 1:D){
+                target += neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+            }
+        }
+    }
+
+    generated quantities {
+        matrix[N, D] y_predict;
+        matrix[N, D] log_lik;
+
+        for (n in 1:N){
+            for (i in 1:D){
+                y_predict[n, i] = neg_binomial_2_log_rng(depth[n] + lam_clr[n, i], phi[i]);
+                log_lik[n, i] = neg_binomial_2_log_lpmf(y[n, i] | depth[n] + lam_clr[n, i], phi[i]);
+            }
+        }
+    }
+
+Running BIRDMAn
+---------------
+
+We will now pass this file along with our table, metadata, and formula into BIRDMAn. Note that we are using the base ``Model`` class for our custom model.
+
+.. code-block:: python
+
+    import birdman
+
+    nb_lme = birdman.Model(  # note that we are instantiating a base Model object
+        table=filt_tbl,
+        formula="C(time_point, Treatment('pre-deworm'))",
+        metadata=metadata_model.loc[samps_to_keep],
+        model_path="negative_binomial_re.stan",
+        num_iter=500,
+        chains=4,
+        seed=42
+    )
+
+We then want to update our data dictionary with the new parameters.
+
+By default BIRDMAn computes and includes:
+
+* ``y``: table data
+* ``x``: covariate design matrix
+* ``N``: number of samples
+* ``D``: number of features
+* ``p``: number of covariates (including Intercept)
+
+We want to add the necessary variables to be passed to Stan:
+
+* ``S``: total number of groups (subjects)
+* ``subj_ids``: mapping of samples to subject
+* ``B_p``: stdev prior for normally distributed covariate-feature coefficients
+* ``phi_s``: scale prior for half-Cauchy distributed overdispersion coefficients
+* ``depth``: log sampling depths of samples
+* ``u_p``: stdev prior for normally distributed subject intercept shifts
+
+We want to provide ``subj_ids`` with a mapping of which sample corresponds to which subject. Stan does not understand strings so we encode each unique subject as an integer (starting at 1 because Stan 1-indexes arrays).
+
+.. code-block:: python
+
+    import numpy as np
+
+    group_var_series = metadata_model.loc[samps_to_keep]["host_subject_id"]
+    samp_subj_map = group_var_series.astype("category").cat.codes + 1
+    groups = np.sort(group_var_series.unique())
+
+Now we can add all the necessary parameters to BIRDMAn with the ``add_parameters`` function.
+
+.. code-block:: python
+
+    param_dict = {
+        "S": len(groups),
+        "subj_ids": samp_subj_map.values,
+        "depth": np.log(filt_tbl.sum(axis="sample")),
+        "B_p": 3.0,
+        "phi_s": 3.0,
+        "u_p": 1.0
+    }
+    nb_lme.add_parameters(param_dict)
+
+Finally, we compile and fit the model.
+
+.. note::
+
+    Fitting this model took approximately 6 minutes on my laptop.
+
+.. code-block:: python
+
+    nb_lme.compile_model()
+    nb_lme.fit_model()
+
+Converting to ``InferenceData``
+-------------------------------
+
+With a custom model there is a bit more legwork involved in converting to the ``arviz.InferenceData`` data structure. We will step through each of the parameters in this example.
+
+* ``params``: List of parameters you want to include in the posterior draws (must match Stan code).
+* ``dims``: Dictionary of dimensions of each parameter to include. Note that we also include the names of the variables for log likelihood and posterior predictive values, ``log_lik`` and ``y_predict`` respectively.
+* ``coords``: Mapping of dimensions in ``dims`` to their indices. We internally save ``feature_names``, ``sample_names``, and ``colnames`` (names of covariates in design matrix).
+* ``alr_params``: List of parameters which come out of Stan in ALR coordinates. These will be converted into centered CLR coordinates before being returned.
+* ``posterior_predictive``: Name of variable holding posterior predictive values (optional).
+* ``log_likelihood``: Name of variable holding log likelihood values (optional).
+* ``include_observed_data``: Whether to include the original feature table as a group. This is useful for certain diagnostics.
+
+.. code-block:: python
+
+    inference = nb_lme.to_inference_object(
+        params=["beta", "phi", "subj_int"],
+        dims={
+            "beta": ["covariate", "feature"],
+            "phi": ["feature"],
+            "subj_int": ["subject"],
+            "log_lik": ["tbl_sample", "feature"],
+            "y_predict": ["tbl_sample", "feature"]
+        },
+        coords={
+            "feature": nb_lme.feature_names,
+            "covariate": nb_lme.colnames,
+            "subject": groups,
+            "tbl_sample": nb_lme.sample_names
+        },
+        alr_params=["beta"],
+        posterior_predictive="y_predict",
+        log_likelihood="log_lik",
+        include_observed_data=True
+    )
+
+With this you can use the rest of the BIRDMAn suite as usual or directly interact with the ``arviz`` library!
diff --git a/docs/diagnosing_model.rst b/docs/diagnosing_model.rst
new file mode 100644
index 0000000..6efcea0
--- /dev/null
+++ b/docs/diagnosing_model.rst
@@ -0,0 +1,59 @@
+Diagnosing BIRDMAn models
+=========================
+
+One of the most important things to do when running BIRDMAn is to diagnose your model to make sure that it is useful for your downstream analysis. There are a lot of ways to do this and a ton of literature/tools about diagnosing Bayesian models.
+
+BIRDMAn wraps many function present in the ``arviz`` package. We recommend reading through their documentation for more information. You can use the ``to_inference_object`` method in any BIRDMAn object to convert your model into a format that is compatible with ``arviz``. We wrap a few of these functions for common diagnostics.
+
+Rhat
+----
+
+The ``Rhat`` (:math:`\hat{R}`) statistic is a measure of chain convergence in MCMC methods. In a nutshell this metric describes the concordance between within-chain and between-chain parameter estimates. If these values are very similar, ``Rhat`` will be close to 1. You should check the ``Rhat`` values of your model to ensure convergence.
+
+.. code-block:: python
+
+    from birdman.diagnostics import rhat
+    rhat(inference)
+
+PSIS-LOO-CV
+-----------
+
+Pareto-smoothed importance sampling leave-one-out cross-validation (PSIS-LOO-CV, referred to informally as LOO) estimates the (log) pointwise predictive density. This can be thought of as an estimation of traditional leave-one-out cross validation, which is often very computationally expensive for Bayesian models. To calculate this metric, one must generate posterior predictive values in the Stan code and specify this variable when calling ``to_inference_object``.
+
+The expected log pointwise predictive density ``elpd`` is the sum of the posterior draws log likelihood values. The objective is to maximize this value (closer to 0). However, it is important to note that this is more useful as a relative measure. All other things being equal, a model with a higher ``elpd`` better predicts feature counts.
+
+An important note here is the ability to calculate the *pointwise* predictive accuracy (at each data point). If you provide the ``pointwise=True`` option, you will also get the Pareto k diagnostic values. This metric assesses how well the PSIS-esimation converges. These values should be mostly less than 0.7. If you have many values above 0.7, it may indicate problems with the underlying model.
+
+.. code-block:: python
+
+    from birdman.diagnostics import loo
+    loo(inference, pointwise=True)
+
+Posterior predictive check
+--------------------------
+
+Posterior predictive checking (PPC) is the process of using the simulated posterior draws to predict the original values. If the model is fit well, you should be able to predict the counts in your table to a reasonable degree of accuracy. Note that for sequencing count data it is very difficult to predict values with small credible intervals. We recommend checking that the general shape of the predicted values matches the shape of the actual values.
+
+We provide a function to plot posterior predictive checks if they have been calculated and included in the ``InferenceData`` object. This function returns a ``matplotlib`` figure where the black line represents the original values, the dark gray points represent the median predicted values across all chains and draws, and the light gray vertical lines represent the 95% credible interval of the value.
+
+.. code-block:: python
+
+    from birdman.visualization import plot_posterior_predictive_checks
+    plot_posterior_predictive_checks(inference)
+
+.. image:: imgs/ppc_plot.png
+
+
+Comparing to null model
+-----------------------
+
+One way to determine whether or not your model is learning anything biologically useful is to compare to a "null" model (with no covariates). If you find that your model does not have more predictive power than random chance, it may be an indication that you should rethink your model construction.
+
+This can be done fairly easily with ``arviz.compare``. This function takes as input a dictionary where the ``key: value`` pairs are the model name and the associated inference object respectively. The result will be a table of the models ranked in *decreasing* order of "goodness". Note that models must include ``log_likelihood`` groups for use in ``arviz.compare``.
+
+.. code-block:: python
+
+    import arviz as az
+    # model_1 = InferenceData from model with covariates included
+    # model_0 = InferenceData from model with formula = "1"
+    az.compare({"null": model_0, "regression": model_1})
diff --git a/docs/imgs/ppc_plot.png b/docs/imgs/ppc_plot.png
new file mode 100644
index 0000000..e8c7410
Binary files /dev/null and b/docs/imgs/ppc_plot.png differ
diff --git a/docs/index.rst b/docs/index.rst
index 7d5dea0..369ada4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,6 +21,7 @@ The easiest way to install the required dependencies is through ``conda``.
 
     conda install -c conda-forge dask biom-format patsy xarray
     git clone git@github.com:gibsramen/BIRDMAn.git
+    pip install git+git://github.com/arviz-devs/arviz.git
     cd BIRDMAn
     pip install -e .
 
@@ -45,10 +46,19 @@ In Python:
 
 .. toctree::
     :maxdepth: 2
-    :caption: Tutorial
+    :caption: User Guide
 
     default_model_example
-    user_defined_model
+    custom_model
+    diagnosing_model
+    writing_stan_code
+    working_with_arviz
+
+.. toctree::
+    :maxdepth: 2
+    :caption: How BIRDMAn Works
+
+    bayesian_inference
 
 .. toctree::
     :maxdepth: 2
@@ -58,6 +68,7 @@ In Python:
     diagnostics
     stats
     util
+    visualization
 
 Indices and tables
 ------------------
diff --git a/docs/models.rst b/docs/models.rst
index 9b3977f..15d9841 100644
--- a/docs/models.rst
+++ b/docs/models.rst
@@ -14,5 +14,8 @@ Default Models
 --------------
 
 .. autoclass:: birdman.default_models.NegativeBinomial
+    :members:
 .. autoclass:: birdman.default_models.NegativeBinomialLME
+    :members:
 .. autoclass:: birdman.default_models.Multinomial
+    :members:
diff --git a/docs/user_defined_model.rst b/docs/user_defined_model.rst
deleted file mode 100644
index 1537045..0000000
--- a/docs/user_defined_model.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Defining your own model
-=======================
diff --git a/docs/visualization.rst b/docs/visualization.rst
new file mode 100644
index 0000000..b4a563a
--- /dev/null
+++ b/docs/visualization.rst
@@ -0,0 +1,6 @@
+Visualization Functions
+=======================
+
+.. automodule:: birdman.visualization
+    :members:
+
diff --git a/docs/working_with_arviz.rst b/docs/working_with_arviz.rst
new file mode 100644
index 0000000..bd93173
--- /dev/null
+++ b/docs/working_with_arviz.rst
@@ -0,0 +1,128 @@
+Working with ``arviz``
+======================
+
+``arviz`` is a wonderful library for analysis of Bayesian models. It includes many functions to help researchers determine how well a model performs both visually and numerically. However, those coming from primarily ``pandas`` and ``numpy`` experience may find it a bit challenging to orient themselves to using ``arviz`` and its underlying data structure derived from the ``xarray`` package. Here, we give a short breakdown on how to start analyzing an ``arviz.InferenceData`` object.
+
+For more information on these two packages, see their documentation:
+
+* `arviz <https://arviz-devs.github.io/arviz/index.html>`_
+* `xarray <http://xarray.pydata.org/en/stable/>`_
+
+``xarray``
+----------
+
+An ``arviz.InferenceData`` is essentially a collection of ``xarray`` objects so we will cover ``xarray`` first. ``xarray`` is a Python package designed for efficient and elegant interaction of multi-dimensional data. In Bayesian analysis we deal primarily with data that includes both chain and draw as dimensions in addition to the original parameter dimensions. For example, in a ``birdman.NegativeBinomial`` model, the :math:`\beta` parameter will be output with 4 dimensions: chain, draw, covariate, and feature. Dealing with all of these dimensions in simple NumPy arrays can get confusing as you try to keep track of which dimension is which. ``xarray`` uses named dimensions and coordinates to make this process much cleaner and intuitive.
+
+``xarray.Dataset``
+^^^^^^^^^^^^^^^^^^
+
+The ``Dataset`` is the primary data structure that you will be interfacing with in ``xarray``. A ``Dataset`` is a collection of data variables (for example parameters from posterior sampling) that can have different dimensionality. As an example, here is a ``Dataset`` holding posterior draws from a ``birdman.NegativeBinomial`` model:
+
+.. code-block::
+
+    <xarray.Dataset>
+    Dimensions:    (chain: 4, covariate: 2, draw: 100, feature: 143)
+    Coordinates:
+    * chain      (chain) int64 0 1 2 3
+    * draw       (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99
+    * feature    (feature) object '193358' '4465746' ... '212228' '192368'
+    * covariate  (covariate) object 'Intercept' 'diet[T.DIO]'
+    Data variables:
+        beta       (chain, draw, covariate, feature) float64 7.142 3.549 ... 0.4587
+        phi        (chain, draw, feature) float64 0.1216 0.3286 ... 0.8205 0.6151
+    Attributes:
+        created_at:                 2021-04-01T22:37:03.663497
+        arviz_version:              0.11.2
+        inference_library:          cmdstanpy
+        inference_library_version:  0.9.68
+
+The ``Dimensions`` descriptor shows the names and number of entries in each dimensions. The ``Coordinates`` entry holds the labels for each of the dimensions. In this example we see that the chains are labeled 0-3 and the draws are labeled 0-99. However, the features are labeled with OTU IDs and the covariates are labeled with the entries in the design matrix. The ``Data variables`` entry contains the actual data (in this case parameters) and lists the dimensionality of each. Note that ``beta`` is of dimension chain, draw, covariate, feature while ``phi`` is only of dimension chain, draw, feature. The ``Dataset`` is a powerful data structure because it can hold data variables of varying dimensionality.
+
+``xarray.DataArray``
+^^^^^^^^^^^^^^^^^^^^
+
+A ``Dataset`` is simply a collection of ``DataArray`` objects. Whereas a ``Dataset`` can contain multiple data variables, a ``DataArray`` contains only one. If you want to access the ``beta`` variable from the above ``Dataset``, you simply index it like you would a dictionary. If you have a ``Dataset``, ``ds``, with a data variable ``beta``, you would access it with ``ds["beta"]`` which returns:
+
+.. code-block::
+
+    <xarray.DataArray 'beta' (chain: 4, draw: 100, covariate: 2, feature: 143)>
+    array([[[[ 7.14216 , ..., -0.03673 ],
+            [-0.639199, ...,  0.958811]],
+
+            ...,
+
+            [[ 7.071049, ..., -0.374691],
+            [-0.61982 , ...,  0.5009  ]]],
+
+
+            ...,
+
+
+            [[[ 7.096262, ..., -0.281968],
+            [-0.607823, ...,  1.190807]],
+
+            ...,
+
+            [[ 7.185024, ...,  0.038614],
+            [-0.671318, ...,  0.458722]]]])
+    Coordinates:
+    * chain      (chain) int64 0 1 2 3
+    * draw       (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99
+    * feature    (feature) object '193358' '4465746' ... '212228' '192368'
+    * covariate  (covariate) object 'Intercept' 'diet[T.DIO]'
+
+Selecting and indexing data
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Manipulating data in ``xarray`` is a bit more involved than in NumPy. The most important thing to keep in mind is the notion of ``dims`` (dimensions) and ``coords`` (coordinates). Dimensions are the *names* while coordinates are the *labels*.
+
+You can use the ``.sel`` function to select specific slices of data. To extract all values from just chain 0, you would run:
+
+.. code-block:: python
+
+    ds["beta"].sel(chain=0)
+
+You can also index across multiple dimensions - if you wanted only values from chain 2 from the diet covariate you would run:
+
+.. code-block:: python
+
+    ds["beta"].sel(chain=2, covariate="diet[T.DIO]")
+
+This also works with multiple values for a given dimension. As an example if you wanted to get all diet posterior samples from just features 193358 and 4465746 you would run:
+
+.. code-block:: python
+
+    ds["beta"].sel(feature=["193358", "4465746"], covariate="diet[T.DIO]")
+
+See the `documentation <http://xarray.pydata.org/en/stable/indexing.html>`_ for more on indexing and selecting data.
+
+``arviz``
+---------
+
+An ``arviz.InferenceData`` object is a collection of ``xarray.Datasets`` organized for use in Bayesian model analysis. Each inference comprises several groups such as posterior draws, sample stats, log likelihood values, etc. ``arviz`` organizes these different groups such that they can be used seamlessly for downstream analysis.
+
+If you run a ``birdman.NegativeBinomial`` model and convert it to an inference object, you can print this object and see the following:
+
+.. code-block:: bash
+
+    Inference data with groups:
+            > posterior
+            > posterior_predictive
+            > log_likelihood
+            > sample_stats
+            > observed_data
+
+Each group is an ``xarray.Dataset`` that you can interact with as described above. You can access each of these groups with either attribute notation (``inference.posterior``) or index notation (``inference["posterior"]``).
+
+Saving and loading data
+^^^^^^^^^^^^^^^^^^^^^^^
+
+It is useful to be able to save the results of BIRDMAn so that they can be analyzed later or distributed to collaborators. The best way to do this is to save the ``InferenceData`` object in the `NetCDF <https://www.unidata.ucar.edu/software/netcdf/>`_ format. This is a compressed format that works very well with multi-dimensional arrays.
+
+You can save and load fitted models with ``to_netcdf`` and ``from_netcdf``.
+
+.. code-block:: python
+
+    import arviz as az
+    inference.to_netcdf("inference.nc")
+    inference_loaded = az.from_netcdf("inference.nc")
diff --git a/docs/writing_stan_code.rst b/docs/writing_stan_code.rst
new file mode 100644
index 0000000..086e79c
--- /dev/null
+++ b/docs/writing_stan_code.rst
@@ -0,0 +1,50 @@
+Writing Stan code
+=================
+
+The core of BIRDMAn is running compiled Stan code that defines your model. While we do provide several default options, one of the key strengths of BIRDMAn is its extensibility to custom differential abundance modeling. This page gives an overview of how to write custom Stan code for BIRDMAn modeling.
+
+See the `Stan documentation <https://mc-stan.org/docs/2_26/stan-users-guide/index.html>`_ for more details.
+
+Structure of a Stan program
+---------------------------
+
+Stan programs are made up of several "blocks". The main ones we use in BIRDMAn are:
+
+* ``data``: This block defines the types and dimensions of the data used in the program. Every variable present in this block must be passed into the Stan program from Python.
+* ``parameters``: This block defines the types and dimensions of the parameters you are interested in fitting.
+* ``transformed parameters``: This block defines any transformations to perform on the data. For example we use this block to multiply the design matrix by the feature-covariate coefficient matrix.
+* ``model``: This block defines the setting of any priors and the fitting of your target distribution (negative binomial, multinomial, etc.).
+* ``generated quantities``: This block is for generating any other results such as posterior predictive values and log likelihood values.
+
+Data
+----
+
+Every Stan program you use with BIRDMAn passes the following data by default into the data block:
+
+* ``N``: Number of samples
+* ``D``: Number of features
+* ``x``: Design matrix
+* ``p``: Number of columns in the design matrix (computed from formula)
+* ``y``: Feature table values
+
+Any other parameters must be added to the BIRDMAn model with the ``add_parameters`` method (unless using a default model).
+
+Parameters
+----------
+
+In this block you should specify the parameters you are primarily interested in fitting. This includes coefficient matrices, random intercepts, etc. Important to note that if you are fitting feature coefficients you may need to use ALR coordinates. For example, in the ``NegativeBinomial`` default model we specify ``matrix[p, D-1] beta;`` as ALR coordinates remove one feature as reference.
+
+Transformed parameters
+----------------------
+
+The ``transformed parameters`` block is used for defining operations on the parameters more advanced than just defining them. In the default models we use it to wrangle :math:`x\beta` into a format that's easier to use in the ``model`` block.
+
+Model
+-----
+
+This is where the meat of the computation lies. The model block is where you assign priors and compute likelihood. The parameters and data should all come together in this block to sample from the posterior for your regression.
+
+Generated quantities
+--------------------
+
+This block is primarily for calculations in addition to HMC sampling. In BIRDMAn default models we perform posterior predictive and log likelihood calculations in this block from the draws generated in the model block.
diff --git a/setup.py b/setup.py
index 28ee82f..73113c1 100644
--- a/setup.py
+++ b/setup.py
@@ -2,10 +2,28 @@
 
 __version__ = "0.0.1"
 
+# Inspired by the EMPress classifiers
+classifiers = """
+    Development Status :: 2 - Pre-Alpha
+    License :: OSI Approved :: BSD License
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Intended Audience :: Science/Research
+    Operating System :: POSIX
+    Operating System :: MacOS :: MacOS X
+    Topic :: Scientific/Engineering :: Bio-Informatics
+"""
+short_desc = "Framework for differential abundance using Bayesian inference"
+
 setup(
     name="birdman",
     author="Gibraan Rahman",
+    author_email="grahman@eng.ucsd.edu",
+    description=short_desc,
+    url="https://github.com/gibsramen/BIRDMAn",
     version=__version__,
+    license='BSD-3-Clause',
     packages=find_packages(),
     include_package_data=True,
     package_data={"": ["*.stan"]},
@@ -16,9 +34,9 @@
         "biom-format",
         "patsy",
         "xarray",
-        "arviz @ git+git://github.com/arviz-devs/arviz.git"
-        # Temporary solution until github.com/arviz-devs/arviz/pull/1579
-        # is included in on PyPI
+        "pandas",
+        "arviz"
     ],
-    extras_require={"dev": ["pytest", "scikit-bio"]}
+    extras_require={"dev": ["pytest", "scikit-bio", "sphinx"]},
+    classifiers=classifiers
 )
diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 0f23100..a1b5219 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -8,7 +8,7 @@ def test_rank_plot_beta(self, example_inf):
         viz.plot_parameter_estimates(
             inference_object=example_inf,
             parameter="beta",
-            coord={"covariate": "host_common_name[T.long-tailed macaque]"},
+            coords={"covariate": "host_common_name[T.long-tailed macaque]"},
             num_std=1
         )
 
@@ -19,7 +19,7 @@ def test_rank_plot_phi(self, example_inf):
             num_std=1
         )
 
-    def test_rank_plot_no_coord(self, example_inf):
+    def test_rank_plot_no_coords(self, example_inf):
         with pytest.raises(ValueError) as excinfo:
             viz.plot_parameter_estimates(
                 inference_object=example_inf,