Skip to content

Commit

Permalink
BUG/TST: Add back in data matching/checking code
Browse files Browse the repository at this point in the history
Closes biocore#139, for real this time.

Eventually we'll need to check that feature metadata matches up, but
that is its own problem for later down the road.
  • Loading branch information
fedarko committed Apr 7, 2020
1 parent cb25912 commit 1619a80
Show file tree
Hide file tree
Showing 3 changed files with 248 additions and 16 deletions.
39 changes: 23 additions & 16 deletions empress/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,35 @@ def plot(output_dir: str,
sample_metadata: qiime2.Metadata,
feature_metadata: qiime2.Metadata = None) -> None:

# 1. Convert inputs to the formats we want

# TODO: do not ignore the feature metadata when specified by the user
if feature_metadata is not None:
feature_metadata = feature_metadata.to_dataframe()

sample_metadata = sample_metadata.to_dataframe()

# create/parse tree
tree_file = str(tree)
# path to the actual newick file
with open(tree_file) as file:
t = parse_newick(file.readline())
empress_tree = Tree.from_tree(to_skbio_treenode(t))
tools.name_internal_nodes(empress_tree)

# 2. Now that we've converted/read/etc. all of the four input sources,
# ensure that the samples and features they describe "match up" sanely.

feature_table, sample_metadata = tools.match_inputs(
empress_tree, feature_table, sample_metadata, feature_metadata
)

# 3. Go forward with creating the Empress visualization!

# extract balance parenthesis
bp_tree = list(t.B)

# calculate tree coordinates
empress_tree = Tree.from_tree(to_skbio_treenode(t))
tools.name_internal_nodes(empress_tree)

# Compute coordinates resulting from layout algorithm(s)
# TODO: figure out implications of screen size
layout_to_coordsuffix, default_layout = empress_tree.coords(4020, 4020)

Expand Down Expand Up @@ -83,27 +95,22 @@ def plot(output_dir: str,
env = Environment(loader=FileSystemLoader(TEMPLATES))
temp = env.get_template('empress-template.html')

# sample metadata
sample_data = sample_metadata \
.to_dataframe().filter(feature_table.index, axis=0) \
.to_dict(orient='index')
# Convert sample metadata to a JSON-esque format
sample_data = sample_metadata.to_dict(orient='index')

# TODO: Empress is currently storing all metadata as strings. This is
# memory intensive and wont scale well. We should convert all numeric
# memory intensive and won't scale well. We should convert all numeric
# data/compress metadata.

# This is used in biom-table. Currently this is only used to ignore null
# data (i.e. NaN and "unknown") and also determines sorting order.
# The original intent is to signal what
# columns are discrete/continous.
# data (i.e. NaN and "unknown") and also determines sorting order. The
# original intent is to signal what columns are discrete/continuous.
# type of sample metadata (n - number, o - object)
sample_data_type = sample_metadata \
.to_dataframe().filter(feature_table.index, axis=0) \
.dtypes \
.to_dict()
sample_data_type = sample_metadata.dtypes.to_dict()
sample_data_type = {k: 'n' if pd.api.types.is_numeric_dtype(v) else 'o'
for k, v in sample_data_type.items()}


# create a mapping of observation ids and the samples that contain them
obs_data = {}
feature_table = (feature_table > 0).T
Expand Down
165 changes: 165 additions & 0 deletions empress/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from skbio import TreeNode


class DataMatchingError(Exception):
    """Signals that the input data sources could not be matched up.

    Raised by match_inputs() when the tree and feature table share no
    features, or when the sample metadata and feature table share no samples.
    """


def name_internal_nodes(tree):
""" Name internal nodes that don't have a name
Expand Down Expand Up @@ -35,3 +39,164 @@ def read(file_name, file_format='newick'):
tree = skbio.read(file_name, file_format, into=TreeNode)
return tree
return None


def print_if_dropped(
    df_old, df_new, axis_num, item_name, df_name, filter_basis_name
):
    """Reports to the user when filtering has shrunk a DataFrame.

    The sizes of df_old and df_new along axis_num are compared. If df_new is
    smaller, a two-line message is printed stating how many items were
    dropped and that they were removed from the visualization. Otherwise
    nothing is printed.

    Parameters
    ----------
    df_old: pd.DataFrame (or pd.SparseDataFrame)
        The "unfiltered" DataFrame, used as the reference size.
    df_new: pd.DataFrame (or pd.SparseDataFrame)
        The potentially-filtered DataFrame.
    axis_num: int
        Which entry of the DataFrames' .shape tuples to compare. This should
        be 0 or 1, but that is not explicitly checked.
    item_name: str
        What one entry along this axis is called in the message. In
        practice, this is either "sample" or "feature".
    df_name: str
        The name of the DataFrame represented by df_old and df_new.
    filter_basis_name: str
        The name of the other data source that caused items to be dropped.
        For example, when samples were dropped from the sample metadata
        file because they were not in the table, df_name could be
        "sample metadata file" and filter_basis_name could be "table".

    References
    ----------
    This function was adapted from Qurro's source code:
    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L203
    """
    size_before = df_old.shape[axis_num]
    size_after = df_new.shape[axis_num]
    dropped_item_ct = size_before - size_after

    # Nothing was filtered out, so there is nothing to tell the user.
    if dropped_item_ct <= 0:
        return

    summary = "{} {}(s) in the {} were not present in the {}.".format(
        dropped_item_ct, item_name, df_name, filter_basis_name
    )
    consequence = (
        "These {}(s) have been removed from the "
        "visualization."
    ).format(item_name)

    print(summary)
    print(consequence)


def match_inputs(tree, table, sample_metadata, feature_metadata=None):
    """Matches various input sources.

    Parameters
    ----------
    tree: empress.tree.Tree
        The tree to be visualized.
    table: pd.DataFrame
        Representation of the feature table (containing features' abundances
        in samples). The index should describe feature IDs; the columns
        should describe sample IDs.
    sample_metadata: pd.DataFrame
        Sample metadata. The index should describe sample IDs; the columns
        should describe different sample metadata fields' names.
    feature_metadata: pd.DataFrame or None
        Feature metadata. If this is passed, the index should describe
        feature IDs and the columns should describe different feature
        metadata fields' names. This is currently accepted but not used:
        no matching is done against it yet.

    Returns
    -------
    (table, sample_metadata): (pd.DataFrame, pd.DataFrame)
        Versions of the input table and sample metadata filtered such that:
            -The table only contains samples also present in the sample
             metadata.
            -The table only contains features also present in the tree.
            -The sample metadata only contains samples also present in the
             table.
        Features and samples are kept in the order in which they appear in
        the input table, and the sample metadata's rows follow the same
        sample order as the table's columns. The sample metadata's columns
        keep their original dtypes.

    Raises
    ------
    DataMatchingError
        If any of the following conditions are met:
            -No features are shared between the tree and table.
            -No samples are shared between the sample metadata and table.

    References
    ----------
    This function was based on match_table_and_data() in Qurro's code:
    https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/_df_utils.py#L255
    """
    # Match table and tree
    # NOTE: This may be slow for huge trees / tables, could likely be optimized
    tree_node_names = {n.name for n in tree.preorder()}
    # Use a boolean mask rather than indexing .loc with a set: set indexers
    # are rejected by newer pandas versions, and a mask also keeps the
    # table's features in a deterministic (i.e. their original) order.
    feature_in_tree = table.index.isin(tree_node_names)

    if not feature_in_tree.any():
        raise DataMatchingError(
            "No features are shared between the tree's nodes and the feature "
            "table."
        )

    # Filter table to just features that are also present in the tree
    # Note that we *don't* filter the tree analogously, because we want to draw
    # the whole tree (that being said the Empress UI supports just showing
    # features in the table, anyway)
    ff_table = table.loc[feature_in_tree]

    # Report to user about any dropped features from table
    print_if_dropped(table, ff_table, 0, "feature", "table", "tree")

    # Match table and sample metadata. We deliberately avoid transposing the
    # sample metadata here: transposing a DataFrame with mixed dtypes yields
    # object-dtype data, and transposing it back does not restore the
    # original dtypes -- so numeric metadata fields would no longer be
    # detectable as numeric by callers inspecting .dtypes.
    sample_in_metadata = ff_table.columns.isin(sample_metadata.index)

    # Check that at least 1 sample is shared between the s. metadata and table
    if not sample_in_metadata.any():
        raise DataMatchingError(
            "No samples are shared between the sample metadata file and the "
            "feature table."
        )

    sf_ff_table = ff_table.loc[:, sample_in_metadata]
    # Select (and order) metadata rows by the table's remaining samples, so
    # that both outputs describe the shared samples in the same order.
    sf_sample_metadata = sample_metadata.loc[list(sf_ff_table.columns)]

    # Report to user about any dropped samples from s. metadata and/or table
    print_if_dropped(
        sample_metadata,
        sf_sample_metadata,
        0,
        "sample",
        "sample metadata file",
        "table",
    )
    print_if_dropped(
        table,
        sf_ff_table,
        1,
        "sample",
        "table",
        "sample metadata file",
    )
    return sf_ff_table, sf_sample_metadata
60 changes: 60 additions & 0 deletions tests/python/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#
# ----------------------------------------------------------------------------
import unittest
import pandas as pd
from skbio import TreeNode
from empress import Tree
import empress.tools as tools
Expand All @@ -17,6 +18,28 @@ def mock_tree_from_nwk(self):

def setUp(self):
    """Builds the tree, feature table and sample metadata used by the tests.

    The table has eight features (rows) and four samples (columns); the
    sample metadata has one row per sample in the table and four fields.
    """
    self.tree = self.mock_tree_from_nwk()

    # Test table/metadata (mostly) adapted from Qurro:
    # https://github.com/biocore/qurro/blob/b9613534b2125c2e7ee22e79fdff311812f4fefe/qurro/tests/test_df_utils.py#L178
    feature_ids = ["a", "c", "e", "d", "b", "x", "y", "z"]
    abundances = {
        "Sample1": [1, 2, 3, 4, 5, 6, 7, 8],
        "Sample2": [8, 7, 6, 5, 4, 3, 2, 1],
        "Sample3": [1, 0, 0, 0, 0, 0, 0, 0],
        "Sample4": [0, 0, 0, 1, 0, 0, 0, 0],
    }
    self.table = pd.DataFrame(abundances, index=feature_ids)

    # The sample metadata is indexed by exactly the samples in the table.
    sample_ids = list(self.table.columns)
    metadata_fields = {
        "Metadata1": [0, 0, 0, 1],
        "Metadata2": [0, 0, 0, 0],
        "Metadata3": [1, 2, 3, 4],
        "Metadata4": ["abc", "def", "ghi", "jkl"],
    }
    self.sample_metadata = pd.DataFrame(metadata_fields, index=sample_ids)

    # TODO Also test matching feature metadata, when that's supported
    self.feature_metadata = None

def test_name_internal_nodes(self):
t = Tree.from_tree(self.tree)
Expand All @@ -26,6 +49,43 @@ def test_name_internal_nodes(self):
for i, node in enumerate(t.postorder()):
self.assertEqual(node.name, names[i])

def test_match_inputs_basic(self):
    """Tests the basic case where no samples are dropped, but some features
    are present in the table but not the tree.
    """
    named_tree = Tree.from_tree(self.tree)
    tools.name_internal_nodes(named_tree)
    matched = tools.match_inputs(
        named_tree, self.table, self.sample_metadata
    )
    filtered_tbl, filtered_sample_metadata = matched

    # Every sample in this example data is in both the table and the
    # sample metadata, so none of them should've been dropped.
    self.assertCountEqual(filtered_tbl.columns, self.table.columns)
    self.assertCountEqual(
        filtered_sample_metadata.index, self.sample_metadata.index
    )
    # Sanity check: filtering samples shouldn't have removed any sample
    # metadata columns.
    self.assertCountEqual(
        filtered_sample_metadata.columns, self.sample_metadata.columns
    )
    # Some features should've been dropped from the table:
    # "a", "b", "e", and "d" are the only features present in both the
    # table and tree.
    expected_features = ["a", "b", "e", "d"]
    self.assertCountEqual(filtered_tbl.index, expected_features)
    # TODO: ensure that dropped-feature message is printed

def test_match_inputs_no_shared_samples(self):
    """Tests that an error is raised when the sample metadata and the table
    have no sample IDs in common.
    """
    named_tree = Tree.from_tree(self.tree)
    tools.name_internal_nodes(named_tree)

    # Relabel every sample in the metadata so that none match the table.
    bad_sample_metadata = self.sample_metadata.copy()
    bad_sample_metadata.index = ["lol", "nothing", "here", "matches"]

    expected_message = (
        "No samples are shared between the sample metadata file and the "
        "feature table."
    )
    with self.assertRaisesRegex(tools.DataMatchingError, expected_message):
        tools.match_inputs(named_tree, self.table, bad_sample_metadata)


if __name__ == "__main__":
unittest.main()

0 comments on commit 1619a80

Please sign in to comment.