BUG/TST: Add back in data matching/checking code

Closes biocore#139, for real this time. Eventually we'll need to check that feature metadata matches up, but that is its own problem for later down the road.
fedarko · Apr 7, 2020 · 1619a80 · 1619a80
1 parent cb25912
commit 1619a80
Show file tree

Hide file tree

Showing 3 changed files with 248 additions and 16 deletions.
diff --git a/empress/_plot.py b/empress/_plot.py
@@ -30,23 +30,35 @@ def plot(output_dir: str,
          sample_metadata: qiime2.Metadata,
          feature_metadata: qiime2.Metadata = None) -> None:
 
+    # 1. Convert inputs to the formats we want
+
     # TODO: do not ignore the feature metadata when specified by the user
     if feature_metadata is not None:
         feature_metadata = feature_metadata.to_dataframe()
 
+    sample_metadata = sample_metadata.to_dataframe()
+
     # create/parse tree
     tree_file = str(tree)
     # path to the actual newick file
     with open(tree_file) as file:
         t = parse_newick(file.readline())
+    empress_tree = Tree.from_tree(to_skbio_treenode(t))
+    tools.name_internal_nodes(empress_tree)
+
+    # 2. Now that we've converted/read/etc. all of the four input sources,
+    # ensure that the samples and features they describe "match up" sanely.
+
+    feature_table, sample_metadata = tools.match_inputs(
+        empress_tree, feature_table, sample_metadata, feature_metadata
+    )
+
+    # 3. Go forward with creating the Empress visualization!
 
     # extract balance parenthesis
     bp_tree = list(t.B)
 
-    # calculate tree coordinates
-    empress_tree = Tree.from_tree(to_skbio_treenode(t))
-    tools.name_internal_nodes(empress_tree)
-
+    # Compute coordinates resulting from layout algorithm(s)
     # TODO: figure out implications of screen size
     layout_to_coordsuffix, default_layout = empress_tree.coords(4020, 4020)
 
@@ -83,27 +95,22 @@ def plot(output_dir: str,
     env = Environment(loader=FileSystemLoader(TEMPLATES))
     temp = env.get_template('empress-template.html')
 
-    # sample metadata
-    sample_data = sample_metadata \
-        .to_dataframe().filter(feature_table.index, axis=0) \
-        .to_dict(orient='index')
+    # Convert sample metadata to a JSON-esque format
+    sample_data = sample_metadata.to_dict(orient='index')
 
     # TODO: Empress is currently storing all metadata as strings. This is
-    # memory intensive and wont scale well. We should convert all numeric
+    # memory intensive and won't scale well. We should convert all numeric
     # data/compress metadata.
 
     # This is used in biom-table. Currently this is only used to ignore null
-    # data (i.e. NaN and "unknown") and also determines sorting order.
-    # The original intent is to signal what
-    # columns are discrete/continous.
+    # data (i.e. NaN and "unknown") and also determines sorting order. The
+    # original intent is to signal what columns are discrete/continuous.
     # type of sample metadata (n - number, o - object)
-    sample_data_type = sample_metadata \
-        .to_dataframe().filter(feature_table.index, axis=0) \
-        .dtypes \
-        .to_dict()
+    sample_data_type = sample_metadata.dtypes.to_dict()
     sample_data_type = {k: 'n' if pd.api.types.is_numeric_dtype(v) else 'o'
                         for k, v in sample_data_type.items()}
 
+
     # create a mapping of observation ids and the samples that contain them
     obs_data = {}
     feature_table = (feature_table > 0).T

diff --git a/empress/tools.py b/empress/tools.py
@@ -2,6 +2,10 @@
 from skbio import TreeNode
 
 
+class DataMatchingError(Exception):
+    """Raised when input data sources share no features or no samples."""
+
+
 def name_internal_nodes(tree):
     """ Name internal nodes that don't have a name
 
@@ -35,3 +39,164 @@ def read(file_name, file_format='newick'):
         tree = skbio.read(file_name, file_format, into=TreeNode)
         return tree
     return None
+
+
+def print_if_dropped(
+    df_old, df_new, axis_num, item_name, df_name, filter_basis_name
+):
+    """Reports (via print) how many items filtering removed from a DataFrame.
+
+       The number of dropped items is computed as
+       df_old.shape[axis_num] - df_new.shape[axis_num]. Nothing is printed
+       unless that number is positive.
+
+    Parameters
+    ----------
+
+    df_old: pd.DataFrame (or pd.SparseDataFrame)
+         The DataFrame as it was before filtering; serves as the baseline.
+
+    df_new: pd.DataFrame (or pd.SparseDataFrame)
+         The DataFrame after (possible) filtering.
+
+    axis_num: int
+         Which entry of the DataFrames' .shape tuples to compare. Expected to
+         be 0 or 1; this is not validated here.
+
+    item_name: str
+         What a single entry along that axis is called in the printed
+         message. In practice, this is either "sample" or "feature".
+
+    df_name: str
+         Human-readable name of the DataFrame that df_old and df_new
+         represent, e.g. "sample metadata file".
+
+    filter_basis_name: str
+         Human-readable name of the other data source whose contents caused
+         the items to be dropped. For example, when samples were dropped from
+         the sample metadata file because they were not in the table,
+         df_name could be "sample metadata file" and filter_basis_name could
+         be "table".
+
+    References
+    ----------
+
+    This function was adapted from Qurro's source code:
+    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L203
+    """
+    num_dropped = df_old.shape[axis_num] - df_new.shape[axis_num]
+    if num_dropped <= 0:
+        # Nothing was filtered out, so there is nothing to report.
+        return
+
+    summary = "{} {}(s) in the {} were not present in the {}.".format(
+        num_dropped, item_name, df_name, filter_basis_name
+    )
+    consequence = (
+        "These {}(s) have been removed from the "
+        "visualization.".format(item_name)
+    )
+    print(summary)
+    print(consequence)
+
+
+def match_inputs(tree, table, sample_metadata, feature_metadata=None):
+    """Matches various input sources.
+
+    Parameters
+    ----------
+
+    tree: empress.tree.Tree
+        The tree to be visualized.
+
+    table: pd.DataFrame
+        Representation of the feature table (containing features' abundances in
+        samples). The index should describe feature IDs; the columns should
+        describe sample IDs.
+
+    sample_metadata: pd.DataFrame
+        Sample metadata. The index should describe sample IDs; the columns
+        should describe different sample metadata fields' names.
+
+    feature_metadata: pd.DataFrame or None
+        Feature metadata. If this is passed, the index should describe feature
+        IDs and the columns should describe different feature metadata fields'
+        names. NOTE: this is accepted but not yet used in the matching done
+        here.
+
+    Returns
+    -------
+
+    (table, sample_metadata): (pd.DataFrame, pd.DataFrame)
+        Versions of the input table and sample metadata filtered such that:
+            -The table only contains samples also present in the sample
+             metadata.
+            -The table only contains features also present in the tree.
+            -The sample metadata only contains samples also present in the
+             table.
+        The table keeps its original feature and sample order, and the
+        sample metadata's rows follow the order of the table's samples. The
+        sample metadata is never transposed, so its columns keep their
+        dtypes.
+
+    Raises
+    ------
+
+    DataMatchingError
+        If any of the following conditions are met:
+            -No features are shared between the tree and table.
+            -No samples are shared between the sample metadata and table.
+
+    References
+    ----------
+
+    This function was based on match_table_and_data() in Qurro's code:
+    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L255
+    """
+    # Match table and tree
+    # NOTE: This may be slow for huge trees / tables, could likely be optimized
+    tree_node_names = {n.name for n in tree.preorder()}
+    # Boolean mask over the table's rows: True where the feature is a node
+    # in the tree
+    feature_in_tree = table.index.isin(tree_node_names)
+
+    if not feature_in_tree.any():
+        raise DataMatchingError(
+            "No features are shared between the tree's nodes and the feature "
+            "table."
+        )
+
+    # Filter table to just features that are also present in the tree
+    # Note that we *don't* filter the tree analogously, because we want to draw
+    # the whole tree (that being said the Empress UI supports just showing
+    # features in the table, anyway)
+    # A boolean mask is used instead of a set of labels: newer pandas
+    # versions reject sets as .loc indexers, and a mask keeps the table's
+    # feature order deterministic.
+    ff_table = table.loc[feature_in_tree]
+
+    # Report to user about any dropped features from table
+    print_if_dropped(table, ff_table, 0, "feature", "table", "tree")
+
+    # Match table and sample metadata
+    # Boolean mask over the table's columns: True where the sample also has
+    # a row in the sample metadata
+    sample_in_metadata = ff_table.columns.isin(sample_metadata.index)
+
+    # Check that at least 1 sample is shared between the s. metadata and table
+    if not sample_in_metadata.any():
+        raise DataMatchingError(
+            "No samples are shared between the sample metadata file and the "
+            "feature table."
+        )
+
+    sf_ff_table = ff_table.loc[:, sample_in_metadata]
+    # Select the shared samples' rows directly rather than transposing the
+    # metadata: transposing a DataFrame with mixed column types turns every
+    # column into object dtype, which would hide numeric metadata fields.
+    sf_sample_metadata = sample_metadata.loc[sf_ff_table.columns]
+    # At this point, the columns of sf_ff_table and the index of
+    # sf_sample_metadata are filtered to just the shared samples.
+
+    # Report to user about any dropped samples from s. metadata and/or table
+    print_if_dropped(
+        sample_metadata,
+        sf_sample_metadata,
+        0,
+        "sample",
+        "sample metadata file",
+        "table",
+    )
+    print_if_dropped(
+        table,
+        sf_ff_table,
+        1,
+        "sample",
+        "table",
+        "sample metadata file",
+    )
+    return sf_ff_table, sf_sample_metadata
diff --git a/tests/python/test_tools.py b/tests/python/test_tools.py
@@ -5,6 +5,7 @@
 #
 # ----------------------------------------------------------------------------
 import unittest
+import pandas as pd
 from skbio import TreeNode
 from empress import Tree
 import empress.tools as tools
@@ -17,6 +18,28 @@ def mock_tree_from_nwk(self):
 
     def setUp(self):
         self.tree = self.mock_tree_from_nwk()
+        # Test table/metadata (mostly) adapted from Qurro:
+        # https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/tests/test_df_utils.py#L178
+        # Feature table: one row per feature ID, one column per sample ID
+        feature_ids = ["a", "c", "e", "d", "b", "x", "y", "z"]
+        abundances = {
+            "Sample1": [1, 2, 3, 4, 5, 6, 7, 8],
+            "Sample2": [8, 7, 6, 5, 4, 3, 2, 1],
+            "Sample3": [1, 0, 0, 0, 0, 0, 0, 0],
+            "Sample4": [0, 0, 0, 1, 0, 0, 0, 0]
+        }
+        self.table = pd.DataFrame(abundances, index=feature_ids)
+        # Sample metadata: indexed by the table's sample IDs (a fresh list of
+        # its column names), with one column per metadata field
+        sample_ids = list(self.table.columns)
+        metadata_fields = {
+            "Metadata1": [0, 0, 0, 1],
+            "Metadata2": [0, 0, 0, 0],
+            "Metadata3": [1, 2, 3, 4],
+            "Metadata4": ["abc", "def", "ghi", "jkl"]
+        }
+        self.sample_metadata = pd.DataFrame(metadata_fields, index=sample_ids)
+        # TODO Also test matching feature metadata, when that's supported
+        self.feature_metadata = None
 
     def test_name_internal_nodes(self):
         t = Tree.from_tree(self.tree)
@@ -26,6 +49,43 @@ def test_name_internal_nodes(self):
         for i, node in enumerate(t.postorder()):
             self.assertEqual(node.name, names[i])
 
+    def test_match_inputs_basic(self):
+        """Checks the simplest scenario: every sample is retained, while
+           features found in the table but missing from the tree are removed.
+        """
+        empress_tree = Tree.from_tree(self.tree)
+        tools.name_internal_nodes(empress_tree)
+        matched = tools.match_inputs(
+            empress_tree, self.table, self.sample_metadata
+        )
+        matched_tbl, matched_sm = matched
+        # The example data shares all of its samples, so the table's columns
+        # and the sample metadata's index should both be unchanged.
+        self.assertCountEqual(matched_tbl.columns, self.table.columns)
+        self.assertCountEqual(matched_sm.index, self.sample_metadata.index)
+        # Sanity check: matching samples shouldn't remove any of the sample
+        # metadata's columns
+        self.assertCountEqual(
+            matched_sm.columns, self.sample_metadata.columns
+        )
+        # Of the table's features, only "a", "b", "e", and "d" are also in
+        # the tree, so these are the only ones that should remain.
+        shared_features = ["a", "b", "e", "d"]
+        self.assertCountEqual(matched_tbl.index, shared_features)
+        # TODO: ensure that dropped-feature message is printed
+
+    def test_match_inputs_no_shared_samples(self):
+        """Checks that match_inputs() raises an error if the sample metadata
+           and the table have no sample IDs in common.
+        """
+        empress_tree = Tree.from_tree(self.tree)
+        tools.name_internal_nodes(empress_tree)
+        # Relabel every sample in a copy of the metadata so that none of
+        # them are present in the table
+        mismatched_sm = self.sample_metadata.copy()
+        mismatched_sm.index = ["lol", "nothing", "here", "matches"]
+        expected_msg = (
+            "No samples are shared between the sample metadata file and the "
+            "feature table."
+        )
+        with self.assertRaisesRegex(tools.DataMatchingError, expected_msg):
+            tools.match_inputs(empress_tree, self.table, mismatched_sm)
+
 
 if __name__ == "__main__":
     unittest.main()