Skip to content
This repository has been archived by the owner on Dec 29, 2023. It is now read-only.

WIP: ILR transform on tensors #75

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ requirements:
- numpy >=1.15.3
- statsmodels >=0.9.0 # TODO: fix this upstream
- pandas
- xarray
- seaborn
- bokeh
- biom-format >=2.1.5,<2.2.0
Expand Down
63 changes: 52 additions & 11 deletions q2_gneiss/composition/_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
Balance, Composition)
from q2_types.feature_data import FeatureData, Differential
from q2_types.ordination import PCoAResults

from q2_differential._type import FeatureTensor
from q2_gneiss.composition._method import (
ilr_hierarchical, ilr_phylogenetic,
ilr_phylogenetic_posterior_differential,
ilr_phylogenetic_differential,
ilr_phylogenetic_ordination
)
from qiime2.plugin import Float, Int, List, Str
from qiime2.plugin import Float, Int, List, Str, Bool


plugin.methods.register_function(
Expand Down Expand Up @@ -79,16 +80,19 @@


plugin.methods.register_function(
function=ilr_phylogenetic_differential,
inputs={'differential': FeatureData[Differential],
function=ilr_phylogenetic_posterior_differential,
inputs={'posterior': FeatureTensor,
'tree': Phylogeny[Rooted]},
outputs=[('ilr_differential', FeatureData[Differential]),
outputs=[('balances', FeatureTensor),
('clade_metadata', FeatureData[Differential]),
('bifurcated_tree', Phylogeny[Rooted])],
name=('Differentially abundant Phylogenetic Log Ratios.'),
parameters={},
name=('Bayesian differentially abundant Phylogenetic Log Ratios.'),
parameters={
'minimax_filter': Bool
},
input_descriptions={
'differential': (
'The differential abundance results in which '
'posterior': (
'The posterior differential abundance results in which '
'will be ilr transformed.'),
'tree': ('A rooted phylogeny of feature identifiers that defines '
'the partitions of features. Each tip in the hierarchy'
Expand All @@ -100,9 +104,15 @@
'contain polytomic nodes (i.e., nodes with more than '
'two children), in which case they will be bifurcated.')
},
parameter_descriptions={},
parameter_descriptions={
'minimax_filter': ('If specified, only the clades that have a '
'chance of being the most increased or decreased '
'balance are outputted. If not specified all '
'significant clades are outputted.')
},
output_descriptions={
'ilr_differential': 'Per clade differential abundance results.',
'balances': 'Per clade differential abundance results.',
'clade_metadata': 'Metadata specifying clade membership.',
'bifurcated_tree': 'Bifurcating phylogeny.'
},
description=("Compute an ILR transform of differentials "
Expand Down Expand Up @@ -147,3 +157,34 @@
},
description="Compute an ILR ordination given a rooted phylogeny."
)

plugin.methods.register_function(
function=ilr_phylogenetic_differential,
inputs={'differential': FeatureData[Differential],
'tree': Phylogeny[Rooted]},
outputs=[('ilr_differential', FeatureData[Differential]),
('bifurcated_tree', Phylogeny[Rooted])],
name=('Differentially abundant Phylogenetic Log Ratios.'),
parameters={},
input_descriptions={
'differential': (
'The differential abundance results in which '
'will be ilr transformed.'),
'tree': ('A rooted phylogeny of feature identifiers that defines '
'the partitions of features. Each tip in the hierarchy'
'corresponds to the feature identifiers in the table. '
'This tree can contain tip ids that are not present in '
'the table, but all feature ids in the table must be '
'present in this tree. This assumes that all of the '
'internal nodes in the tree have labels. This tree may '
'contain polytomic nodes (i.e., nodes with more than '
'two children), in which case they will be bifurcated.')
},
parameter_descriptions={},
output_descriptions={
'ilr_differential': 'Per clade differential abundance results.',
'bifurcated_tree': 'Bifurcating phylogeny.'
},
description=("Compute an ILR transform of differentials "
"given a rooted phylogeny.")
)
54 changes: 50 additions & 4 deletions q2_gneiss/composition/_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
import pandas as pd
import skbio
from gneiss.composition import ilr_transform
from gneiss.balances import sparse_balance_basis
from gneiss.util import match_tips
from gneiss.util import rename_internal_nodes
from gneiss.util import rename_internal_nodes, rename_clades
from gneiss.util import _xarray_match_tips
from gneiss.util import NUMERATOR, DENOMINATOR

from q2_gneiss._util import add_pseudocount
from gneiss.balances import _balance_basis
from q2_gneiss._util import add_pseudocount
from skbio import OrdinationResults
import xarray as xr


def ilr_hierarchical(table: pd.DataFrame, tree: skbio.TreeNode,
Expand Down Expand Up @@ -48,6 +50,51 @@ def ilr_phylogenetic_differential(
return diff_balances, t


def ilr_phylogenetic_posterior_differential(
posterior: xr.Dataset, tree: skbio.TreeNode,
minimax_filter: bool = True
) -> (xr.DataArray, pd.DataFrame, skbio.TreeNode):
dataset, trimmed_tree = _xarray_match_tips(posterior, tree, 'features')
trimmed_tree = rename_clades(trimmed_tree)
# TODO: watch out for this indexing with diff. The ordering of the dims
# could mess things up here.
posterior_data = np.array(dataset['diff'].values)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should replace this with a pandas pivot so that the dimension order doesn't matter.

basis, nodes = sparse_balance_basis(trimmed_tree) # D-1 x D
ilr_tensor = basis @ posterior_data
balances = xr.DataArray(
ilr_tensor, coords={'balances': nodes},
dims=['balances', 'mc_samples'])
mc_samples = ilr_tensor.shape[1]
# fetches the most significant clades
# pull out log-odds ranks
bs = pd.DataFrame(ilr_tensor, index=nodes)
if minimax_filter:
max_taxa = list(set(bs.idxmax().value_counts().index))
min_taxa = list(set(bs.idxmin().value_counts().index))
clades = min_taxa + max_taxa
else:
pos_sig = (bs > 0).sum(axis=1) == mc_samples
neg_sig = (bs < 0).sum(axis=1) == mc_samples
pos_clades = list(pos_sig.loc[pos_sig].index)
neg_clades = list(neg_sig.loc[neg_sig].index)
clades = pos_clades + neg_clades
dense_basis = {}
# create dense basis
features = list(dataset['features'].values)
for c in clades:
subtree = trimmed_tree.find(c)
num_tips = get_children(subtree, NUMERATOR)
denom_tips = get_children(subtree, DENOMINATOR)
r, s = len(num_tips), len(denom_tips)
dense_basis[c] = pd.Series(np.zeros(len(features)),
index=features)
dense_basis[c].loc[num_tips] = np.sqrt(s / (r * (r + s)))
dense_basis[c].loc[denom_tips] = -np.sqrt(s / (s * (r + s)))
clade_metadata = pd.DataFrame(dense_basis)
clade_metadata.index.name = 'featureid'
return balances, clade_metadata, trimmed_tree


def get_children(tree, side):
if tree.children[side].is_tip():
return [tree.children[side].name]
Expand Down Expand Up @@ -77,7 +124,6 @@ def _fast_ilr(tree, table, clades, pseudocount=0.5):

num = logmean(table, num_tips, pseudocount)
denom = logmean(table, denom_tips, pseudocount)
print(num.shape, denom.shape)
balances[c] = pd.Series(Z * (num - denom), index=table.index)
basis[c] = pd.Series(np.zeros(len(table.columns)), index=table.columns)
basis[c].loc[num_tips] = np.sqrt(s / (r * (r + s)))
Expand Down
22 changes: 21 additions & 1 deletion q2_gneiss/composition/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import pandas.testing as pdt

from q2_gneiss.composition._method import (
ilr_hierarchical, ilr_phylogenetic, ilr_phylogenetic_ordination
ilr_hierarchical, ilr_phylogenetic, ilr_phylogenetic_ordination,
ilr_phylogenetic_posterior_differential,
)
from q2_gneiss._util import add_pseudocount

Expand Down Expand Up @@ -153,6 +154,25 @@ def test_ilr_ordination(self):
exp_md.index.name = 'featureid'
pdt.assert_frame_equal(res_md, exp_md)

def test_ilr_tensor(self):
import xarray as xr
data = np.array(
[[0, 0, 1, 1],
[0, 0, 1, 2],
[0, 0, 1, 3]]
).T
# TODO: the order of the dims really matters here
dataset = xr.DataArray(
data,
dims=['features', 'monte_carlo_samples'],
coords=[['a', 'b', 'c', 'd'], [0, 1, 2]]
)
dataset.name = 'diff'
dataset = dataset.to_dataset()
tree = TreeNode.read([u"(((b,a)f, c),d)r;"])
ilr_phylogenetic_posterior_differential(dataset, tree, True)
ilr_phylogenetic_posterior_differential(dataset, tree, False)


if __name__ == '__main__':
unittest.main()