qiime2 · mortonjt · Aug 29, 2020 · Aug 29, 2020 · Aug 29, 2020 · Aug 29, 2020
diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
@@ -26,6 +26,7 @@ requirements:
     - numpy >=1.15.3
     - statsmodels >=0.9.0  # TODO: fix this upstream
     - pandas
+    - xarray
     - seaborn
     - bokeh
     - biom-format >=2.1.5,<2.2.0

diff --git a/q2_gneiss/composition/_composition.py b/q2_gneiss/composition/_composition.py
@@ -11,13 +11,14 @@
                                     Balance, Composition)
 from q2_types.feature_data import FeatureData, Differential
 from q2_types.ordination import PCoAResults
-
+from q2_differential._type import FeatureTensor
 from q2_gneiss.composition._method import (
     ilr_hierarchical, ilr_phylogenetic,
+    ilr_phylogenetic_posterior_differential,
     ilr_phylogenetic_differential,
     ilr_phylogenetic_ordination
 )
-from qiime2.plugin import Float, Int, List, Str
+from qiime2.plugin import Float, Int, List, Str, Bool
 
 
 plugin.methods.register_function(
@@ -79,16 +80,19 @@
 
 
 plugin.methods.register_function(
-    function=ilr_phylogenetic_differential,
-    inputs={'differential': FeatureData[Differential],
+    function=ilr_phylogenetic_posterior_differential,
+    inputs={'posterior': FeatureTensor,
             'tree': Phylogeny[Rooted]},
-    outputs=[('ilr_differential', FeatureData[Differential]),
+    outputs=[('balances', FeatureTensor),
+             ('clade_metadata', FeatureData[Differential]),
              ('bifurcated_tree', Phylogeny[Rooted])],
-    name=('Differentially abundant Phylogenetic Log Ratios.'),
-    parameters={},
+    name=('Bayesian differentially abundant Phylogenetic Log Ratios.'),
+    parameters={
+        'minimax_filter': Bool
+    },
     input_descriptions={
-        'differential': (
-            'The differential abundance results in which '
+        'posterior': (
+            'The posterior differential abundance results in which '
             'will be ilr transformed.'),
         'tree': ('A rooted phylogeny of feature identifiers that defines '
                  'the partitions of features.  Each tip in the hierarchy'
@@ -100,9 +104,15 @@
                  'contain polytomic nodes (i.e., nodes with more than '
                  'two children), in which case they will be bifurcated.')
     },
-    parameter_descriptions={},
+    parameter_descriptions={
+        'minimax_filter': ('If specified, only the clades that have a '
+                           'chance of being the most increased or decreased '
+                           'balance are outputted. If not specified all '
+                           'significant clades are outputted.')
+    },
     output_descriptions={
-        'ilr_differential': 'Per clade differential abundance results.',
+        'balances': 'Per clade differential abundance results.',
+        'clade_metadata': 'Metadata specifying clade membership.',
         'bifurcated_tree': 'Bifurcating phylogeny.'
     },
     description=("Compute an ILR transform of differentials "
@@ -147,3 +157,34 @@
     },
     description="Compute an ILR ordination given a rooted phylogeny."
 )
+
+plugin.methods.register_function(
+    function=ilr_phylogenetic_differential,
+    inputs={'differential': FeatureData[Differential],
+            'tree': Phylogeny[Rooted]},
+    outputs=[('ilr_differential', FeatureData[Differential]),
+             ('bifurcated_tree', Phylogeny[Rooted])],
+    name=('Differentially abundant Phylogenetic Log Ratios.'),
+    parameters={},
+    input_descriptions={
+        'differential': (
+            'The differential abundance results in which '
+            'will be ilr transformed.'),
+        'tree': ('A rooted phylogeny of feature identifiers that defines '
+                 'the partitions of features.  Each tip in the hierarchy'
+                 'corresponds to the feature identifiers in the table. '
+                 'This tree can contain tip ids that are not present in '
+                 'the table, but all feature ids in the table must be '
+                 'present in this tree.  This assumes that all of the '
+                 'internal nodes in the tree have labels. This tree may '
+                 'contain polytomic nodes (i.e., nodes with more than '
+                 'two children), in which case they will be bifurcated.')
+    },
+    parameter_descriptions={},
+    output_descriptions={
+        'ilr_differential': 'Per clade differential abundance results.',
+        'bifurcated_tree': 'Bifurcating phylogeny.'
+    },
+    description=("Compute an ILR transform of differentials "
+                 "given a rooted phylogeny.")
+)
diff --git a/q2_gneiss/composition/_method.py b/q2_gneiss/composition/_method.py
@@ -9,13 +9,15 @@
 import pandas as pd
 import skbio
 from gneiss.composition import ilr_transform
+from gneiss.balances import sparse_balance_basis
 from gneiss.util import match_tips
-from gneiss.util import rename_internal_nodes
+from gneiss.util import rename_internal_nodes, rename_clades
+from gneiss.util import _xarray_match_tips
 from gneiss.util import NUMERATOR, DENOMINATOR
-
-from q2_gneiss._util import add_pseudocount
 from gneiss.balances import _balance_basis
+from q2_gneiss._util import add_pseudocount
 from skbio import OrdinationResults
+import xarray as xr
 
 
 def ilr_hierarchical(table: pd.DataFrame, tree: skbio.TreeNode,
@@ -48,6 +50,51 @@ def ilr_phylogenetic_differential(
     return diff_balances, t
 
 
+def ilr_phylogenetic_posterior_differential(
+        posterior: xr.Dataset, tree: skbio.TreeNode,
+        minimax_filter: bool = True
+) -> (xr.DataArray, pd.DataFrame, skbio.TreeNode):
+    dataset, trimmed_tree = _xarray_match_tips(posterior, tree, 'features')
+    trimmed_tree = rename_clades(trimmed_tree)
+    # TODO: watch out for this indexing with diff. The ordering of the dims
+    # could mess things up here.
+    posterior_data = np.array(dataset['diff'].values)
+    basis, nodes = sparse_balance_basis(trimmed_tree)  # D-1 x D
+    ilr_tensor = basis @ posterior_data
+    balances = xr.DataArray(
+        ilr_tensor, coords={'balances': nodes},
+        dims=['balances', 'mc_samples'])
+    mc_samples = ilr_tensor.shape[1]
+    # fetches the most significant clades
+    # pull out log-odds ranks
+    bs = pd.DataFrame(ilr_tensor, index=nodes)
+    if minimax_filter:
+        max_taxa = list(set(bs.idxmax().value_counts().index))
+        min_taxa = list(set(bs.idxmin().value_counts().index))
+        clades = min_taxa + max_taxa
+    else:
+        pos_sig = (bs > 0).sum(axis=1) == mc_samples
+        neg_sig = (bs < 0).sum(axis=1) == mc_samples
+        pos_clades = list(pos_sig.loc[pos_sig].index)
+        neg_clades = list(neg_sig.loc[neg_sig].index)
+        clades = pos_clades + neg_clades
+    dense_basis = {}
+    # create dense basis
+    features = list(dataset['features'].values)
+    for c in clades:
+        subtree = trimmed_tree.find(c)
+        num_tips = get_children(subtree, NUMERATOR)
+        denom_tips = get_children(subtree, DENOMINATOR)
+        r, s = len(num_tips),  len(denom_tips)
+        dense_basis[c] = pd.Series(np.zeros(len(features)),
+                                   index=features)
+        dense_basis[c].loc[num_tips] = np.sqrt(s / (r * (r + s)))
+        dense_basis[c].loc[denom_tips] = -np.sqrt(s / (s * (r + s)))
+    clade_metadata = pd.DataFrame(dense_basis)
+    clade_metadata.index.name = 'featureid'
+    return balances, clade_metadata, trimmed_tree
+
+
 def get_children(tree, side):
     if tree.children[side].is_tip():
         return [tree.children[side].name]
@@ -77,7 +124,6 @@ def _fast_ilr(tree, table, clades, pseudocount=0.5):
 
         num = logmean(table, num_tips, pseudocount)
         denom = logmean(table, denom_tips, pseudocount)
-        print(num.shape, denom.shape)
         balances[c] = pd.Series(Z * (num - denom), index=table.index)
         basis[c] = pd.Series(np.zeros(len(table.columns)), index=table.columns)
         basis[c].loc[num_tips] = np.sqrt(s / (r * (r + s)))

diff --git a/q2_gneiss/composition/tests/test_method.py b/q2_gneiss/composition/tests/test_method.py
@@ -13,7 +13,8 @@
 import pandas.testing as pdt
 
 from q2_gneiss.composition._method import (
-    ilr_hierarchical, ilr_phylogenetic, ilr_phylogenetic_ordination
+    ilr_hierarchical, ilr_phylogenetic, ilr_phylogenetic_ordination,
+    ilr_phylogenetic_posterior_differential,
 )
 from q2_gneiss._util import add_pseudocount
 
@@ -153,6 +154,25 @@ def test_ilr_ordination(self):
         exp_md.index.name = 'featureid'
         pdt.assert_frame_equal(res_md, exp_md)
 
+    def test_ilr_tensor(self):
+        import xarray as xr
+        data = np.array(
+            [[0, 0, 1, 1],
+             [0, 0, 1, 2],
+             [0, 0, 1, 3]]
+        ).T
+        # TODO: the order of the dims really matters here
+        dataset = xr.DataArray(
+            data,
+            dims=['features', 'monte_carlo_samples'],
+            coords=[['a', 'b', 'c', 'd'], [0, 1, 2]]
+        )
+        dataset.name = 'diff'
+        dataset = dataset.to_dataset()
+        tree = TreeNode.read([u"(((b,a)f, c),d)r;"])
+        ilr_phylogenetic_posterior_differential(dataset, tree, True)
+        ilr_phylogenetic_posterior_differential(dataset, tree, False)
+
 
 if __name__ == '__main__':
     unittest.main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -26,6 +26,7 @@ requirements: @@
         - numpy >=1.15.3
         - statsmodels >=0.9.0  # TODO: fix this upstream
         - pandas
+        - xarray
         - seaborn
         - bokeh
         - biom-format >=2.1.5,<2.2.0
@@ Expand Down @@