Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Match feature table and ordination #237

Merged
merged 15 commits into from
Jul 10, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,8 @@ dmypy.json
# dev scripts/data
inf/
create-plt.bash

# vi swap and temp files
*.swp
*.swo
*~
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ This generates a visualization of a phylogenetic tree at

### Example 2: Using Empress to visualize a tree in tandem with an ordination

**Note**: In tandem plots we recommend that users carefully consider which
feature table they would like to visualize. These can either be a *filtered
table* (rarefied or without low abundance features), or a *raw table*. If you
select a *filtered table* the tree will include less data than in the *raw
dataset*. If you select the *raw table*, you might find that some nodes in the
tree won't be represented by any of the samples in the ordination (if the
ordination was made using a *filtered table*).
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

something i just thought of (sorry for not bringing this up earlier), but since the table.qza file is changed in this PR the README should be updated to match that, right? i.e. the "view"/download links should be altered at least (preferably with a sentence or 2 of explaining which table from the MP tutorial this is)

```bash
qiime empress plot \
--i-tree docs/moving-pictures/rooted-tree.qza \
Expand Down
Binary file modified docs/moving-pictures/table.qza
Binary file not shown.
2 changes: 2 additions & 0 deletions empress/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
sample_metadata: qiime2.Metadata, pcoa: OrdinationResults = None,
feature_metadata: qiime2.Metadata = None,
ignore_missing_samples: bool = False,
filter_extra_samples: bool = False,
filter_missing_features: bool = False,
number_of_features: int = 5,
filter_unobserved_features_from_phylogeny: bool = True) -> None:
Expand Down Expand Up @@ -56,6 +57,7 @@ def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
sample_metadata=sample_metadata,
feature_metadata=feature_metadata, ordination=pcoa,
ignore_missing_samples=ignore_missing_samples,
filter_extra_samples=filter_extra_samples,
filter_missing_features=filter_missing_features,
filter_unobserved_features_from_phylogeny=trim_tree)

Expand Down
13 changes: 10 additions & 3 deletions empress/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
class Empress():
def __init__(self, tree, table, sample_metadata,
feature_metadata=None, ordination=None,
ignore_missing_samples=False, filter_missing_features=False,
resource_path=None,
ignore_missing_samples=False, filter_extra_samples=False,
filter_missing_features=False, resource_path=None,
filter_unobserved_features_from_phylogeny=True):
"""Visualize a phylogenetic tree

Expand Down Expand Up @@ -63,6 +63,10 @@ def __init__(self, tree, table, sample_metadata,
out; and if no samples are shared between the table and metadata, a
DataMatchingError is raised regardless.) This is analogous to the
ignore_missing_samples flag in Emperor.
filter_extra_samples: bool, optional (default False)
If True, ignores samples in the feature table that are not present
in the ordination. If false, raises a DataMatchingerror if such
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved
samples exist.
filter_missing_features: bool, optional (default False)
If True, filters features from the table that aren't present as
tips in the tree. If False, raises a DataMatchingError if any such
Expand Down Expand Up @@ -110,6 +114,7 @@ def __init__(self, tree, table, sample_metadata,

self._validate_and_match_data(
ignore_missing_samples,
filter_extra_samples,
filter_missing_features,
filter_unobserved_features_from_phylogeny
)
Expand All @@ -128,6 +133,7 @@ def __init__(self, tree, table, sample_metadata,
self._emperor = None

def _validate_and_match_data(self, ignore_missing_samples,
filter_extra_samples,
filter_missing_features,
filter_unobserved_features_from_phylogeny):

Expand All @@ -138,7 +144,8 @@ def _validate_and_match_data(self, ignore_missing_samples,
# table for the rest of this visualizer.
self.table, self.samples, self.tip_md, self.int_md = match_inputs(
self.tree, self.table.T, self.samples, self.features,
ignore_missing_samples, filter_missing_features
self.ordination, ignore_missing_samples, filter_extra_samples,
filter_missing_features
)
# remove unobserved features from the phylogeny
if filter_unobserved_features_from_phylogeny:
Expand Down
5 changes: 5 additions & 0 deletions empress/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
'sample_metadata': Metadata,
'feature_metadata': Metadata,
'ignore_missing_samples': Bool,
'filter_extra_samples': Bool,
'filter_missing_features': Bool,
'number_of_features': Int % Range(1, None),
'filter_unobserved_features_from_phylogeny': Bool
Expand Down Expand Up @@ -78,6 +79,10 @@
'metadata". Note that this flag will only be applied if at least '
'one sample is present in both the feature table and the metadata.'
),
'filter_extra_samples': (
'This will supress the error that\'s raised when samples in the '
'feature table are not included in the ordination.'
),
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved
'filter_missing_features': (
'This will suppress the error raised when the feature table '
'contains features that are not present as tips in the tree. '
Expand Down
47 changes: 43 additions & 4 deletions empress/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def match_inputs(
table,
sample_metadata,
feature_metadata=None,
ordination=None,
filter_extra_samples=False,
ignore_missing_samples=False,
filter_missing_features=False
):
Expand Down Expand Up @@ -87,6 +89,8 @@ def match_inputs(
IDs and the columns should describe different feature metadata fields'
names. (Feature IDs here can describe tips or internal nodes in the
tree.)
ordination: skbio.OrdinationResults, optional
The ordination to display in a tandem plot.
ignore_missing_samples: bool
If True, pads missing samples (i.e. samples in the table but not the
metadata) with placeholder metadata. If False, raises a
Expand All @@ -95,6 +99,10 @@ def match_inputs(
no samples are shared between the table and metadata, a
DataMatchingError is raised regardless.) This is analogous to the
ignore_missing_samples flag in Emperor.
filter_extra_samples: bool, optional
If True, ignores samples in the feature table that are not present in
the ordination. If false, raises a DataMatchingerror if such samples
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved
exist.
filter_missing_features: bool
If True, filters features from the table that aren't present as tips in
the tree. If False, raises a DataMatchingError if any such features
Expand Down Expand Up @@ -136,6 +144,8 @@ def match_inputs(
metadata, AND ignore_missing_samples is False.
5. The feature metadata was passed, but no features present in it
are also present as tips or internal nodes in the tree.
6. The ordination AND feature table don't have exactly the same
samples.

References
----------
Expand All @@ -146,16 +156,45 @@ def match_inputs(
# (Ignore None-named tips in the tree, which will be replaced later on
# with "default" names like "EmpressNode0".)
tip_names = set(bp_tree_tips(bp_tree))
tree_and_table_features = table.index.intersection(tip_names)
ff_table = table.copy()

if ordination is not None:
table_ids = set(ff_table.columns)
ord_ids = set(ordination.samples.index)

# don't allow for disjoint datasets
if not (table_ids & ord_ids):
raise DataMatchingError(
"No samples in the feature table are present in the "
"ordination"
)

if ord_ids.issubset(table_ids):
extra = table_ids - ord_ids
if extra and not filter_extra_samples:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it'd be clearer to replace if extra and not filter_extra_samples: with just an if extra: check, and then below that if not filter_extra_samples: and else: branches that handle the two branches here. I think this would make program flow a bit clearer, since extra has to be truthy in both cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call!

raise DataMatchingError(
"The feature table has more samples than the ordination. "
"These are the problematic sample identifiers: %s. You can"
" override this error by using the "
"--p-filter-extra-samples flag" %
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved
(', '.join(sorted(extra)))
)
elif extra:
ff_table = ff_table[ord_ids]
ff_table = ff_table.loc[ff_table.sum(axis=1) > 0]
fedarko marked this conversation as resolved.
Show resolved Hide resolved
fedarko marked this conversation as resolved.
Show resolved Hide resolved
else:
raise DataMatchingError(
"The ordination has more samples than the feature table"
ElDeveloper marked this conversation as resolved.
Show resolved Hide resolved
)

tree_and_table_features = ff_table.index.intersection(tip_names)
if len(tree_and_table_features) == 0:
# Error condition 1
raise DataMatchingError(
"No features in the feature table are present as tips in the tree."
)

ff_table = table.copy()
if len(tree_and_table_features) < len(table.index):
if len(tree_and_table_features) < len(ff_table.index):
if filter_missing_features:
# Filter table to just features that are also present in the tree.
#
Expand All @@ -164,7 +203,7 @@ def match_inputs(
# (and this is going to be the case in most datasets where the
# features correspond to tips, since internal nodes aren't
# explicitly described in the feature table).
ff_table = table.loc[tree_and_table_features]
ff_table = ff_table.loc[tree_and_table_features]

# Report to user about any dropped features from table.
dropped_feature_ct = table.shape[0] - ff_table.shape[0]
Expand Down
112 changes: 111 additions & 1 deletion tests/python/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import unittest
import pandas as pd
from pandas.testing import assert_frame_equal
from skbio import TreeNode
from skbio import TreeNode, OrdinationResults
from empress import Tree, tools
from empress.taxonomy_utils import split_taxonomy
from bp import parse_newick, from_skbio_treenode
Expand Down Expand Up @@ -73,6 +73,25 @@ def setUp(self):
"Level 7", "Confidence"
]

eigvals = pd.Series([0.50, 0.25, 0.25],
index=['PC1', 'PC2', 'PC3'])
samples = [[0.1, 0.2, 0.3],
[0.2, 0.3, 0.4],
[0.3, 0.4, 0.5],
[0.4, 0.5, 0.6]]
proportion_explained = pd.Series([15.5, 12.2, 8.8],
index=['PC1', 'PC2', 'PC3'])
samples_df = pd.DataFrame(samples,
index=['Sample1', 'Sample2', 'Sample3',
'Sample4'],
columns=['PC1', 'PC2', 'PC3'])
self.ordination = OrdinationResults(
'PCoA',
'Principal Coordinate Analysis',
eigvals,
samples_df,
proportion_explained=proportion_explained)

def test_fill_missing_node_names(self):
t = Tree.from_tree(self.tree)
tools.fill_missing_node_names(t)
Expand All @@ -91,6 +110,20 @@ def test_match_inputs_nothing_dropped(self):
self.assertIsNone(t_md)
self.assertIsNone(i_md)

def test_match_inputs_nothing_dropped_with_ordination(self):
# everything is the same since the ordination has a 1:1 match to the
# feature table
filtered_table, filtered_sample_md, t_md, i_md = tools.match_inputs(
self.bp_tree, self.table, self.sample_metadata,
ordination=self.ordination
)

assert_frame_equal(filtered_table, self.table)
assert_frame_equal(filtered_sample_md, self.sample_metadata)
# We didn't pass in any feature metadata, so we shouldn't get any out
self.assertIsNone(t_md)
self.assertIsNone(i_md)

def test_match_inputs_only_1_feature_in_table(self):
# This is technically allowed (so long as this 1 feature is a tree tip)
tiny_table = self.table.loc[["a"]]
Expand Down Expand Up @@ -381,6 +414,83 @@ def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
self.assertListEqual(list(t_fm.columns), self.exp_split_fm_cols)
self.assertListEqual(list(i_fm.columns), self.exp_split_fm_cols)

def test_disjoint_table_and_ordination(self):
self.ordination.samples.index = pd.Index(['Zample1', 'Zample2',
'Zample3', 'Zample4'])

with self.assertRaisesRegex(
tools.DataMatchingError,
"No samples in the feature table are present in the ordination"
):
tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
ordination=self.ordination)

def test_ordination_is_superset(self):
self.table = pd.DataFrame(
{
"Sample1": [1, 2, 0, 4],
"Sample2": [8, 7, 0, 5],
"Sample3": [1, 0, 0, 0],
},
index=["a", "b", "e", "d"]
)

with self.assertRaisesRegex(
tools.DataMatchingError,
"The ordination has more samples than the feature table"
):
tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
ordination=self.ordination)

def test_table_is_superset_raises(self):
self.table = pd.DataFrame(
{
"Sample1": [1, 2, 0, 4],
"Sample2": [8, 7, 0, 5],
"Sample3": [1, 0, 0, 0],
"Sample4": [1, 0, 0, 0],
"Sample5": [1, 0, 4, 0],
},
index=["a", "b", "e", "d"]
)

with self.assertRaisesRegex(
tools.DataMatchingError,
"The feature table has more samples than the ordination. These are"
" the problematic sample identifiers: Sample5. You can override "
"this error by using the --p-filter-extra-samples flag"
):
tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
ordination=self.ordination)

def test_table_is_superset_override_raises(self):
self.table = pd.DataFrame(
{
"Sample1": [1, 2, 0, 4],
"Sample2": [8, 7, 0, 5],
"Sample3": [1, 0, 0, 0],
"Sample4": [1, 0, 0, 0],
"Sample5": [1, 0, 4, 0],
},
index=["a", "b", "e", "d"]
)

filtered_table, filtered_sample_md, t_md, i_md = tools.match_inputs(
self.bp_tree, self.table, self.sample_metadata,
ordination=self.ordination, filter_extra_samples=True)

exp = self.table.loc[['a', 'b', 'd'],
['Sample1', 'Sample2', 'Sample3', 'Sample4']]

# guarantee the same sample-wise order
assert_frame_equal(filtered_table[exp.columns], exp)
assert_frame_equal(filtered_sample_md.loc[exp.columns],
self.sample_metadata)

# We didn't pass in any feature metadata, so we shouldn't get any out
self.assertIsNone(t_md)
self.assertIsNone(i_md)

def test_shifting(self):
# helper test function to count number of bits in the number
def _count_bits(n):
Expand Down