Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Match feature table and ordination #237

Merged
merged 15 commits into from
Jul 10, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion empress/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _validate_and_match_data(self, ignore_missing_samples,
# table for the rest of this visualizer.
self.table, self.samples, self.tip_md, self.int_md = match_inputs(
self.tree, self.table.T, self.samples, self.features,
ignore_missing_samples, filter_missing_features
self.ordination, ignore_missing_samples, filter_missing_features
)
# remove unobserved features from the phylogeny
if filter_unobserved_features_from_phylogeny:
Expand Down
16 changes: 16 additions & 0 deletions empress/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def match_inputs(
table,
sample_metadata,
feature_metadata=None,
ordination=None,
ignore_missing_samples=False,
filter_missing_features=False
):
Expand Down Expand Up @@ -87,6 +88,8 @@ def match_inputs(
IDs and the columns should describe different feature metadata fields'
names. (Feature IDs here can describe tips or internal nodes in the
tree.)
ordination: skbio.OrdinationResults, optional
The ordination to display in a tandem plot.
ignore_missing_samples: bool
If True, pads missing samples (i.e. samples in the table but not the
metadata) with placeholder metadata. If False, raises a
Expand Down Expand Up @@ -136,6 +139,8 @@ def match_inputs(
metadata, AND ignore_missing_samples is False.
5. The feature metadata was passed, but no features present in it
are also present as tips or internal nodes in the tree.
6. The ordination AND feature table don't have exactly the same
samples.

References
----------
Expand Down Expand Up @@ -271,6 +276,17 @@ def match_inputs(
"either as tips or as internal nodes."
)

if ordination is not None:
# tandem plots require a 1-1 match between feature table and ordination
mismatched = set(ordination.samples.index) ^ set(ff_table.columns)

if mismatched:
raise DataMatchingError(
"The feature table does not have exactly the same samples as "
"the ordination. These are the problematic sample identifiers:"
" %s" % (', '.join(sorted(mismatched)))
)

return ff_table, sf_sample_metadata, tip_metadata, int_metadata


Expand Down
48 changes: 47 additions & 1 deletion tests/python/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import unittest
import pandas as pd
from pandas.testing import assert_frame_equal
from skbio import TreeNode
from skbio import TreeNode, OrdinationResults
from empress import Tree, tools
from empress.taxonomy_utils import split_taxonomy
from bp import parse_newick, from_skbio_treenode
Expand Down Expand Up @@ -73,6 +73,25 @@ def setUp(self):
"Level 7", "Confidence"
]

eigvals = pd.Series([0.50, 0.25, 0.25],
index=['PC1', 'PC2', 'PC3'])
samples = [[0.1, 0.2, 0.3],
[0.2, 0.3, 0.4],
[0.3, 0.4, 0.5],
[0.4, 0.5, 0.6]]
proportion_explained = pd.Series([15.5, 12.2, 8.8],
index=['PC1', 'PC2', 'PC3'])
samples_df = pd.DataFrame(samples,
index=['Sample1', 'Sample2', 'Sample3',
'Sample4'],
columns=['PC1', 'PC2', 'PC3'])
self.ordination = OrdinationResults(
'PCoA',
'Principal Coordinate Analysis',
eigvals,
samples_df,
proportion_explained=proportion_explained)

def test_fill_missing_node_names(self):
t = Tree.from_tree(self.tree)
tools.fill_missing_node_names(t)
Expand All @@ -91,6 +110,20 @@ def test_match_inputs_nothing_dropped(self):
self.assertIsNone(t_md)
self.assertIsNone(i_md)

def test_match_inputs_nothing_dropped_with_ordination(self):
    """match_inputs() is a no-op when the ordination matches the table.

    The ordination fixture is expected to contain exactly the same
    samples as the feature table, so passing it must neither raise nor
    alter any of the matched outputs.
    """
    matched = tools.match_inputs(
        self.bp_tree,
        self.table,
        self.sample_metadata,
        ordination=self.ordination,
    )
    out_table, out_sample_md, tip_md, int_md = matched

    # The table and the sample metadata should come back unchanged.
    assert_frame_equal(out_table, self.table)
    assert_frame_equal(out_sample_md, self.sample_metadata)

    # No feature metadata went in, so none should come out.
    self.assertIsNone(tip_md)
    self.assertIsNone(int_md)

def test_match_inputs_only_1_feature_in_table(self):
# This is technically allowed (so long as this 1 feature is a tree tip)
tiny_table = self.table.loc[["a"]]
Expand Down Expand Up @@ -381,6 +414,19 @@ def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
self.assertListEqual(list(t_fm.columns), self.exp_split_fm_cols)
self.assertListEqual(list(i_fm.columns), self.exp_split_fm_cols)

def test_failed_match_to_ordination(self):
    """match_inputs() rejects an ordination whose samples differ.

    One sample ID in the ordination is renamed so that it no longer
    matches the feature table; the raised error must list both the ID
    missing from the ordination and the ID missing from the table.
    """
    # Rename "Sample2" -> "Zample2" in the ordination only.
    renamed_ids = ['Sample1', 'Zample2', 'Sample3', 'Sample4']
    self.ordination.samples.index = pd.Index(renamed_ids)

    expected_message = (
        "The feature table does not have exactly the same samples as the "
        "ordination. These are the problematic sample identifiers: Sample2"
        ", Zample2"
    )

    with self.assertRaisesRegex(tools.DataMatchingError, expected_message):
        tools.match_inputs(
            self.bp_tree,
            self.table,
            self.sample_metadata,
            ordination=self.ordination,
        )

def test_shifting(self):
# helper test function to count number of bits in the number
def _count_bits(n):
Expand Down