biocore · fedarko · Jul 10, 2020 · Jul 8, 2020 · Jul 8, 2020 · Jul 8, 2020
diff --git a/.gitignore b/.gitignore
@@ -142,3 +142,8 @@ dmypy.json
 # dev scripts/data
 inf/
 create-plt.bash
+
+# vi swap and temp files
+*.swp
+*.swo
+*~
diff --git a/Makefile b/Makefile
@@ -54,4 +54,5 @@ docs:
 		--i-pcoa docs/moving-pictures/unweighted_unifrac_pcoa_results.qza \
 		--m-sample-metadata-file docs/moving-pictures/sample_metadata.tsv \
 		--m-feature-metadata-file docs/moving-pictures/taxonomy.qza \
-		--o-visualization docs/moving-pictures/empress-tree-tandem.qzv
+		--o-visualization docs/moving-pictures/empress-tree-tandem.qzv \
+		--p-filter-extra-samples
diff --git a/README.md b/README.md
@@ -58,6 +58,29 @@ This generates a visualization of a phylogenetic tree at
 
 ### Example 2: Using Empress to visualize a tree in tandem with an ordination
 
+**Note**: When your ordination was created from a subset of your original
+dataset (e.g. the feature table was rarefied, or certain low-frequency features
+or samples were otherwise filtered out), we recommend that you carefully
+consider *which* feature table you would like to visualize in Empress. You can
+use either:
+
+- A *filtered table* that matches the ordination (e.g. with rarefaction done,
+  and/or with low-abundance features/samples removed), or
+- A *raw table* -- that is, the original table before performing
+  rarefaction/filtering for the ordination.
+
+There are some pros and cons for either of these choices. If you use a
+*filtered table*, then the Empress visualization will include less data than in
+the *raw dataset*: this will impact sample presence information, sample
+metadata coloring, and other parts of the visualization. If you select the *raw
+table*, you might find that some nodes in the tree won't be represented by any
+of the samples in the ordination (if the ordination was made using a *filtered
+table*). If you'd like to read more about this, there's some informal
+discussion in [pull request 237](https://github.com/biocore/empress/pull/237).
+
+The command below uses the *raw dataset* and removes extra samples not
+represented in the ordination (using the `--p-filter-extra-samples` flag):
+
 ```bash
 qiime empress plot \
     --i-tree docs/moving-pictures/rooted-tree.qza \
@@ -66,6 +89,7 @@ qiime empress plot \
     --m-sample-metadata-file docs/moving-pictures/sample_metadata.tsv \
     --m-feature-metadata-file docs/moving-pictures/taxonomy.qza \
     --o-visualization docs/moving-pictures/empress-tree-tandem.qzv
+    --p-filter-extra-samples
 ```
 
 This generates a visualization of a phylogenetic tree alongside a visualization

diff --git a/empress/_plot.py b/empress/_plot.py
@@ -29,6 +29,7 @@ def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
          sample_metadata: qiime2.Metadata, pcoa: OrdinationResults = None,
          feature_metadata: qiime2.Metadata = None,
          ignore_missing_samples: bool = False,
+         filter_extra_samples: bool = False,
          filter_missing_features: bool = False,
          number_of_features: int = 5,
          filter_unobserved_features_from_phylogeny: bool = True) -> None:
@@ -56,6 +57,7 @@ def plot(output_dir: str, tree: NewickFormat, feature_table: pd.DataFrame,
                   sample_metadata=sample_metadata,
                   feature_metadata=feature_metadata, ordination=pcoa,
                   ignore_missing_samples=ignore_missing_samples,
+                  filter_extra_samples=filter_extra_samples,
                   filter_missing_features=filter_missing_features,
                   filter_unobserved_features_from_phylogeny=trim_tree)
 

diff --git a/empress/core.py b/empress/core.py
@@ -29,8 +29,8 @@
 class Empress():
     def __init__(self, tree, table, sample_metadata,
                  feature_metadata=None, ordination=None,
-                 ignore_missing_samples=False, filter_missing_features=False,
-                 resource_path=None,
+                 ignore_missing_samples=False, filter_extra_samples=False,
+                 filter_missing_features=False, resource_path=None,
                  filter_unobserved_features_from_phylogeny=True):
         """Visualize a phylogenetic tree
 
@@ -63,6 +63,10 @@ def __init__(self, tree, table, sample_metadata,
             out; and if no samples are shared between the table and metadata, a
             DataMatchingError is raised regardless.) This is analogous to the
             ignore_missing_samples flag in Emperor.
+        filter_extra_samples: bool, optional (default False)
+            If True, ignores samples in the feature table that are not present
+            in the ordination. If False, raises a DataMatchingError if such
+            samples exist.
         filter_missing_features: bool, optional (default False)
             If True, filters features from the table that aren't present as
             tips in the tree. If False, raises a DataMatchingError if any such
@@ -110,6 +114,7 @@ def __init__(self, tree, table, sample_metadata,
 
         self._validate_and_match_data(
             ignore_missing_samples,
+            filter_extra_samples,
             filter_missing_features,
             filter_unobserved_features_from_phylogeny
         )
@@ -128,6 +133,7 @@ def __init__(self, tree, table, sample_metadata,
             self._emperor = None
 
     def _validate_and_match_data(self, ignore_missing_samples,
+                                 filter_extra_samples,
                                  filter_missing_features,
                                  filter_unobserved_features_from_phylogeny):
 
@@ -138,7 +144,8 @@ def _validate_and_match_data(self, ignore_missing_samples,
         # table for the rest of this visualizer.
         self.table, self.samples, self.tip_md, self.int_md = match_inputs(
             self.tree, self.table.T, self.samples, self.features,
-            ignore_missing_samples, filter_missing_features
+            self.ordination, ignore_missing_samples, filter_extra_samples,
+            filter_missing_features
         )
         # remove unobserved features from the phylogeny
         if filter_unobserved_features_from_phylogeny:

diff --git a/empress/plugin_setup.py b/empress/plugin_setup.py
@@ -39,6 +39,7 @@
         'sample_metadata': Metadata,
         'feature_metadata': Metadata,
         'ignore_missing_samples': Bool,
+        'filter_extra_samples': Bool,
         'filter_missing_features': Bool,
         'number_of_features': Int % Range(1, None),
         'filter_unobserved_features_from_phylogeny': Bool
@@ -78,6 +79,13 @@
             'metadata". Note that this flag will only be applied if at least '
             'one sample is present in both the feature table and the metadata.'
         ),
+        'filter_extra_samples': (
+            'This will suppress the error raised when samples in the feature '
+            'table are not included in the ordination. These samples will be '
+            'will be removed from the visualization if this flag is passed. '
+            'Note that this flag will only be applied if at least one sample '
+            'in the table is also present in the ordination.'
+        ),
         'filter_missing_features': (
             'This will suppress the error raised when the feature table '
             'contains features that are not present as tips in the tree. '

diff --git a/empress/tools.py b/empress/tools.py
@@ -60,6 +60,8 @@ def match_inputs(
     table,
     sample_metadata,
     feature_metadata=None,
+    ordination=None,
+    filter_extra_samples=False,
     ignore_missing_samples=False,
     filter_missing_features=False
 ):
@@ -87,6 +89,8 @@ def match_inputs(
         IDs and the columns should describe different feature metadata fields'
         names. (Feature IDs here can describe tips or internal nodes in the
         tree.)
+    ordination: skbio.OrdinationResults, optional
+        The ordination to display in a tandem plot.
     ignore_missing_samples: bool
         If True, pads missing samples (i.e. samples in the table but not the
         metadata) with placeholder metadata. If False, raises a
@@ -95,6 +99,10 @@ def match_inputs(
         no samples are shared between the table and metadata, a
         DataMatchingError is raised regardless.) This is analogous to the
         ignore_missing_samples flag in Emperor.
+    filter_extra_samples: bool, optional
+        If True, ignores samples in the feature table that are not present in
+        the ordination. If False, raises a DataMatchingError if such samples
+        exist.
     filter_missing_features: bool
         If True, filters features from the table that aren't present as tips in
         the tree. If False, raises a DataMatchingError if any such features
@@ -136,6 +144,8 @@ def match_inputs(
                metadata, AND ignore_missing_samples is False.
             5. The feature metadata was passed, but no features present in it
                are also present as tips or internal nodes in the tree.
+            6. The ordination AND feature table don't have exactly the same
+               samples.
 
     References
     ----------
@@ -146,16 +156,46 @@ def match_inputs(
     # (Ignore None-named tips in the tree, which will be replaced later on
     # with "default" names like "EmpressNode0".)
     tip_names = set(bp_tree_tips(bp_tree))
-    tree_and_table_features = table.index.intersection(tip_names)
+    ff_table = table.copy()
+
+    if ordination is not None:
+        table_ids = set(ff_table.columns)
+        ord_ids = set(ordination.samples.index)
+
+        # don't allow for disjoint datasets
+        if not (table_ids & ord_ids):
+            raise DataMatchingError(
+                "No samples in the feature table are present in the "
+                "ordination"
+            )
 
+        if ord_ids.issubset(table_ids):
+            extra = table_ids - ord_ids
+            if extra:
+                if not filter_extra_samples:
+                    raise DataMatchingError(
+                        "The feature table has more samples than the "
+                        "ordination. These are the problematic sample "
+                        "identifiers: %s. You can override this error by using"
+                        " the --p-filter-extra-samples flag." %
+                        (', '.join(sorted(extra)))
+                    )
+                ff_table = ff_table[ord_ids]
+                # remove empty features
+                ff_table = ff_table.loc[ff_table.sum(axis=1) > 0]
+        else:
+            raise DataMatchingError(
+                "The ordination has more samples than the feature table."
+            )
+
+    tree_and_table_features = ff_table.index.intersection(tip_names)
     if len(tree_and_table_features) == 0:
         # Error condition 1
         raise DataMatchingError(
             "No features in the feature table are present as tips in the tree."
         )
 
-    ff_table = table.copy()
-    if len(tree_and_table_features) < len(table.index):
+    if len(tree_and_table_features) < len(ff_table.index):
         if filter_missing_features:
             # Filter table to just features that are also present in the tree.
             #
@@ -164,7 +204,7 @@ def match_inputs(
             # (and this is going to be the case in most datasets where the
             # features correspond to tips, since internal nodes aren't
             # explicitly described in the feature table).
-            ff_table = table.loc[tree_and_table_features]
+            ff_table = ff_table.loc[tree_and_table_features]
 
             # Report to user about any dropped features from table.
             dropped_feature_ct = table.shape[0] - ff_table.shape[0]

diff --git a/tests/python/test_tools.py b/tests/python/test_tools.py
@@ -8,7 +8,7 @@
 import unittest
 import pandas as pd
 from pandas.testing import assert_frame_equal
-from skbio import TreeNode
+from skbio import TreeNode, OrdinationResults
 from empress import Tree, tools
 from empress.taxonomy_utils import split_taxonomy
 from bp import parse_newick, from_skbio_treenode
@@ -73,6 +73,25 @@ def setUp(self):
             "Level 7", "Confidence"
         ]
 
+        eigvals = pd.Series([0.50, 0.25, 0.25],
+                            index=['PC1', 'PC2', 'PC3'])
+        samples = [[0.1, 0.2, 0.3],
+                   [0.2, 0.3, 0.4],
+                   [0.3, 0.4, 0.5],
+                   [0.4, 0.5, 0.6]]
+        proportion_explained = pd.Series([15.5, 12.2, 8.8],
+                                         index=['PC1', 'PC2', 'PC3'])
+        samples_df = pd.DataFrame(samples,
+                                  index=['Sample1', 'Sample2', 'Sample3',
+                                         'Sample4'],
+                                  columns=['PC1', 'PC2', 'PC3'])
+        self.ordination = OrdinationResults(
+                'PCoA',
+                'Principal Coordinate Analysis',
+                eigvals,
+                samples_df,
+                proportion_explained=proportion_explained)
+
     def test_fill_missing_node_names(self):
         t = Tree.from_tree(self.tree)
         tools.fill_missing_node_names(t)
@@ -91,6 +110,20 @@ def test_match_inputs_nothing_dropped(self):
         self.assertIsNone(t_md)
         self.assertIsNone(i_md)
 
+    def test_match_inputs_nothing_dropped_with_ordination(self):
+        # everything is the same since the ordination has a 1:1 match to the
+        # feature table
+        filtered_table, filtered_sample_md, t_md, i_md = tools.match_inputs(
+            self.bp_tree, self.table, self.sample_metadata,
+            ordination=self.ordination
+        )
+
+        assert_frame_equal(filtered_table, self.table)
+        assert_frame_equal(filtered_sample_md, self.sample_metadata)
+        # We didn't pass in any feature metadata, so we shouldn't get any out
+        self.assertIsNone(t_md)
+        self.assertIsNone(i_md)
+
     def test_match_inputs_only_1_feature_in_table(self):
         # This is technically allowed (so long as this 1 feature is a tree tip)
         tiny_table = self.table.loc[["a"]]
@@ -381,6 +414,83 @@ def test_match_inputs_feature_metadata_only_internal_node_metadata(self):
         self.assertListEqual(list(t_fm.columns), self.exp_split_fm_cols)
         self.assertListEqual(list(i_fm.columns), self.exp_split_fm_cols)
 
+    def test_disjoint_table_and_ordination(self):
+        self.ordination.samples.index = pd.Index(['Zample1', 'Zample2',
+                                                  'Zample3', 'Zample4'])
+
+        with self.assertRaisesRegex(
+            tools.DataMatchingError,
+            "No samples in the feature table are present in the ordination"
+        ):
+            tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
+                               ordination=self.ordination)
+
+    def test_ordination_is_superset(self):
+        self.table = pd.DataFrame(
+            {
+                "Sample1": [1, 2, 0, 4],
+                "Sample2": [8, 7, 0, 5],
+                "Sample3": [1, 0, 0, 0],
+            },
+            index=["a", "b", "e", "d"]
+        )
+
+        with self.assertRaisesRegex(
+            tools.DataMatchingError,
+            "The ordination has more samples than the feature table"
+        ):
+            tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
+                               ordination=self.ordination)
+
+    def test_table_is_superset_raises(self):
+        self.table = pd.DataFrame(
+            {
+                "Sample1": [1, 2, 0, 4],
+                "Sample2": [8, 7, 0, 5],
+                "Sample3": [1, 0, 0, 0],
+                "Sample4": [1, 0, 0, 0],
+                "Sample5": [1, 0, 4, 0],
+            },
+            index=["a", "b", "e", "d"]
+        )
+
+        with self.assertRaisesRegex(
+            tools.DataMatchingError,
+            "The feature table has more samples than the ordination. These are"
+            " the problematic sample identifiers: Sample5. You can override "
+            "this error by using the --p-filter-extra-samples flag"
+        ):
+            tools.match_inputs(self.bp_tree, self.table, self.sample_metadata,
+                               ordination=self.ordination)
+
+    def test_table_is_superset_override_raises(self):
+        self.table = pd.DataFrame(
+            {
+                "Sample1": [1, 2, 0, 4],
+                "Sample2": [8, 7, 0, 5],
+                "Sample3": [1, 0, 0, 0],
+                "Sample4": [1, 0, 0, 0],
+                "Sample5": [1, 0, 4, 0],
+            },
+            index=["a", "b", "e", "d"]
+        )
+
+        filtered_table, filtered_sample_md, t_md, i_md = tools.match_inputs(
+            self.bp_tree, self.table, self.sample_metadata,
+            ordination=self.ordination, filter_extra_samples=True)
+
+        exp = self.table.loc[['a', 'b', 'd'],
+                             ['Sample1', 'Sample2', 'Sample3', 'Sample4']]
+
+        # guarantee the same sample-wise order
+        assert_frame_equal(filtered_table[exp.columns], exp)
+        assert_frame_equal(filtered_sample_md.loc[exp.columns],
+                           self.sample_metadata)
+
+        # We didn't pass in any feature metadata, so we shouldn't get any out
+        self.assertIsNone(t_md)
+        self.assertIsNone(i_md)
+
     def test_shifting(self):
         # helper test function to count number of bits in the number
         def _count_bits(n):