From ca8e2c4a7350c315f857e3aee4c84920129823a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Thu, 13 Aug 2020 08:48:58 -0700 Subject: [PATCH 1/3] PERF: Speed-up taxonomic string preprocessing The string operations are vectorized which makes processing the EMP taxonomy strings go from 45 seconds to ~1.5 seconds. --- empress/taxonomy_utils.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/empress/taxonomy_utils.py b/empress/taxonomy_utils.py index 5707d43c1..ed4046307 100644 --- a/empress/taxonomy_utils.py +++ b/empress/taxonomy_utils.py @@ -100,11 +100,19 @@ def split_taxonomy(feature_metadata): "(case insensitive)." ) - # Find the maximum number of ;s within any of the taxonomy annotations - # Thanks Yoshiki for showing how to do this concisely :) - max_sc_count = feature_metadata[tax_col_name].str.count(";").max() - - if max_sc_count == 0: + # Split the single column of taxonomy strings into n columns, where n + # is the highest number of taxonomic levels in any string. We have to + # account for leading and trailing whitespace (.str.strip()) as well as + # for whitespace between delimiters (.str.split()). The regular + # expression splits tokens separated by semicolons and up to one + # leading or trailing space. Importantly, using the expand keyword + # ensures that all rows are padded to have n tokens. The final step is + # to replace those empty values with "Unspecified". + tax_levels = feature_metadata[tax_col_name]\ + .str.strip().str.split(r'\s{0,1};\s{0,1}', expand=True) + tax_levels.fillna('Unspecified', inplace=True) + + if len(tax_levels.columns) == 1: # We allow this in the case of single-rank taxonomies (e.g. just # kingdoms, for some reason). Can change this to an error if # desired. @@ -118,23 +126,9 @@ def split_taxonomy(feature_metadata): TaxonomyWarning ) - # OK, actually do splitting now (taking into account max_sc_count) - def split_taxonomy_col(fm_row): - levels = [r.strip() for r in fm_row.loc[tax_col_name].split(";")] - # If this row's taxonomy has less levels than the max number of - # levels, pad it out with empty strings. - if len(levels) < max_sc_count + 1: - num_missing_levels = (max_sc_count + 1) - len(levels) - levels += ["Unspecified"] * num_missing_levels - return levels - - # Our use of result_type="expand" means that tax_levels will be a - # DataFrame with the same index as feature_metadata but with one column - # for each taxonomic level (in order -- Kingdom, Phylum, etc.) - tax_levels = feature_metadata.apply( - split_taxonomy_col, axis="columns", result_type="expand" - ) - # Assign human-friendly column names: Level 1, Level 2, ... + # Our use of expand=True means that tax_levels will be a DataFrame with + # the same index as feature_metadata but with one column for each + # taxonomic level (in order -- Kingdom, Phylum, etc.) tax_levels.columns = [ "Level {}".format(i) for i in range(1, len(tax_levels.columns) + 1) ] From b9057492baac41e21736e6e440b1baffbd1afc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Thu, 13 Aug 2020 12:50:11 -0700 Subject: [PATCH 2/3] ENH: Catch all whitespace between delimiters Thanks @fedarko --- empress/taxonomy_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/empress/taxonomy_utils.py b/empress/taxonomy_utils.py index ed4046307..85feb5082 100644 --- a/empress/taxonomy_utils.py +++ b/empress/taxonomy_utils.py @@ -104,12 +104,12 @@ def split_taxonomy(feature_metadata): # is the highest number of taxonomic levels in any string. We have to # account for leading and trailing whitespace (.str.strip()) as well as # for whitespace between delimiters (.str.split()). The regular - # expression splits tokens separated by semicolons and up to one - # leading or trailing space. Importantly, using the expand keyword - # ensures that all rows are padded to have n tokens. The final step is - # to replace those empty values with "Unspecified". + # expression splits tokens separated by semicolons and any leading or + # trailing whitespace. Importantly, using the expand keyword ensures + # that all rows are padded to have n tokens. The final step is to + # replace those empty values with "Unspecified". tax_levels = feature_metadata[tax_col_name]\ - .str.strip().str.split(r'\s{0,1};\s{0,1}', expand=True) + .str.strip().str.split(r'\s*;\s*', expand=True) tax_levels.fillna('Unspecified', inplace=True) if len(tax_levels.columns) == 1: From a82126cc1b8c93c5f7681671ee4d6403e92a3f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Thu, 13 Aug 2020 13:02:25 -0700 Subject: [PATCH 3/3] TST: Add a case with a variable number of spaces --- tests/python/test_taxonomy_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/test_taxonomy_utils.py b/tests/python/test_taxonomy_utils.py index 11f5f5aa0..703a2d7c1 100644 --- a/tests/python/test_taxonomy_utils.py +++ b/tests/python/test_taxonomy_utils.py @@ -27,8 +27,10 @@ def setUp(self): "c__Gammaproteobacteria; o__Pasteurellales; " "f__Pasteurellaceae; g__; s__" ), + # add a variable number of whitespace characters to check + # these are all successfully removed ( - "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; " + "k__Bacteria;p__Bacteroidetes ; c__Bacteroidia; " "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; " "s__uniformis" ),