diff --git a/empress/taxonomy_utils.py b/empress/taxonomy_utils.py index 5707d43c1..85feb5082 100644 --- a/empress/taxonomy_utils.py +++ b/empress/taxonomy_utils.py @@ -100,11 +100,19 @@ def split_taxonomy(feature_metadata): "(case insensitive)." ) - # Find the maximum number of ;s within any of the taxonomy annotations - # Thanks Yoshiki for showing how to do this concisely :) - max_sc_count = feature_metadata[tax_col_name].str.count(";").max() - - if max_sc_count == 0: + # Split the single column of taxonomy strings into n columns, where n + # is the highest number of taxonomic levels in any string. We have to + # account for leading and trailing whitespace (.str.strip()) as well as + # for whitespace between delimiters (.str.split()). The regular + # expression splits tokens separated by semicolons and any leading or + # trailing whitespace. Importantly, using the expand keyword ensures + # that all rows are padded to have n tokens. The final step is to + # replace those empty values with "Unspecified". + tax_levels = feature_metadata[tax_col_name]\ + .str.strip().str.split(r'\s*;\s*', expand=True) + tax_levels.fillna('Unspecified', inplace=True) + + if len(tax_levels.columns) == 1: # We allow this in the case of single-rank taxonomies (e.g. just # kingdoms, for some reason). Can change this to an error if # desired. @@ -118,23 +126,9 @@ def split_taxonomy(feature_metadata): TaxonomyWarning ) - # OK, actually do splitting now (taking into account max_sc_count) - def split_taxonomy_col(fm_row): - levels = [r.strip() for r in fm_row.loc[tax_col_name].split(";")] - # If this row's taxonomy has less levels than the max number of - # levels, pad it out with empty strings. - if len(levels) < max_sc_count + 1: - num_missing_levels = (max_sc_count + 1) - len(levels) - levels += ["Unspecified"] * num_missing_levels - return levels - - # Our use of result_type="expand" means that tax_levels will be a - # DataFrame with the same index as feature_metadata but with one column - # for each taxonomic level (in order -- Kingdom, Phylum, etc.) - tax_levels = feature_metadata.apply( - split_taxonomy_col, axis="columns", result_type="expand" - ) - # Assign human-friendly column names: Level 1, Level 2, ... + # Our use of expand=True means that tax_levels will be a DataFrame with + # the same index as feature_metadata but with one column for each + # taxonomic level (in order -- Kingdom, Phylum, etc.) tax_levels.columns = [ "Level {}".format(i) for i in range(1, len(tax_levels.columns) + 1) ] diff --git a/tests/python/test_taxonomy_utils.py b/tests/python/test_taxonomy_utils.py index 11f5f5aa0..703a2d7c1 100644 --- a/tests/python/test_taxonomy_utils.py +++ b/tests/python/test_taxonomy_utils.py @@ -27,8 +27,10 @@ def setUp(self): "c__Gammaproteobacteria; o__Pasteurellales; " "f__Pasteurellaceae; g__; s__" ), + # add a variable number of whitespace characters to check + # these are all successfully removed ( - "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; " + "k__Bacteria;p__Bacteroidetes ; c__Bacteroidia; " "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; " "s__uniformis" ),