From ca8e2c4a7350c315f857e3aee4c84920129823a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= <yoshiki89@gmail.com>
Date: Thu, 13 Aug 2020 08:48:58 -0700
Subject: [PATCH 1/3] PERF: Speed-up taxonomic string preprocessing

The string operations are now vectorized, which cuts the time needed to
process the EMP taxonomy strings from 45 seconds to ~1.5 seconds.
---
 empress/taxonomy_utils.py | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/empress/taxonomy_utils.py b/empress/taxonomy_utils.py
index 5707d43c1..ed4046307 100644
--- a/empress/taxonomy_utils.py
+++ b/empress/taxonomy_utils.py
@@ -100,11 +100,19 @@ def split_taxonomy(feature_metadata):
                 "(case insensitive)."
             )
 
-        # Find the maximum number of ;s within any of the taxonomy annotations
-        # Thanks Yoshiki for showing how to do this concisely :)
-        max_sc_count = feature_metadata[tax_col_name].str.count(";").max()
-
-        if max_sc_count == 0:
+        # Split the single column of taxonomy strings into n columns, where n
+        # is the highest number of taxonomic levels in any string. We have to
+        # account for leading and trailing whitespace (.str.strip()) as well as
+        # for whitespace between delimiters (.str.split()). The regular
+        # expression splits tokens separated by semicolons and up to one
+        # leading or trailing space. Importantly, using the expand keyword
+        # ensures that all rows are padded to have n tokens. The final step is
+        # to replace those empty values with "Unspecified".
+        tax_levels = feature_metadata[tax_col_name]\
+            .str.strip().str.split(r'\s{0,1};\s{0,1}', expand=True)
+        tax_levels.fillna('Unspecified', inplace=True)
+
+        if len(tax_levels.columns) == 1:
             # We allow this in the case of single-rank taxonomies (e.g. just
             # kingdoms, for some reason). Can change this to an error if
             # desired.
@@ -118,23 +126,9 @@ def split_taxonomy(feature_metadata):
                 TaxonomyWarning
             )
 
-        # OK, actually do splitting now (taking into account max_sc_count)
-        def split_taxonomy_col(fm_row):
-            levels = [r.strip() for r in fm_row.loc[tax_col_name].split(";")]
-            # If this row's taxonomy has less levels than the max number of
-            # levels, pad it out with empty strings.
-            if len(levels) < max_sc_count + 1:
-                num_missing_levels = (max_sc_count + 1) - len(levels)
-                levels += ["Unspecified"] * num_missing_levels
-            return levels
-
-        # Our use of result_type="expand" means that tax_levels will be a
-        # DataFrame with the same index as feature_metadata but with one column
-        # for each taxonomic level (in order -- Kingdom, Phylum, etc.)
-        tax_levels = feature_metadata.apply(
-            split_taxonomy_col, axis="columns", result_type="expand"
-        )
-        # Assign human-friendly column names: Level 1, Level 2, ...
+        # Our use of expand=True means that tax_levels will be a DataFrame with
+        # the same index as feature_metadata but with one column for each
+        # taxonomic level (in order -- Kingdom, Phylum, etc.)
         tax_levels.columns = [
             "Level {}".format(i) for i in range(1, len(tax_levels.columns) + 1)
         ]

From b9057492baac41e21736e6e440b1baffbd1afc93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= <yoshiki89@gmail.com>
Date: Thu, 13 Aug 2020 12:50:11 -0700
Subject: [PATCH 2/3] ENH: Catch all whitespace between delimiters

Thanks @fedarko
---
 empress/taxonomy_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/empress/taxonomy_utils.py b/empress/taxonomy_utils.py
index ed4046307..85feb5082 100644
--- a/empress/taxonomy_utils.py
+++ b/empress/taxonomy_utils.py
@@ -104,12 +104,12 @@ def split_taxonomy(feature_metadata):
         # is the highest number of taxonomic levels in any string. We have to
         # account for leading and trailing whitespace (.str.strip()) as well as
         # for whitespace between delimiters (.str.split()). The regular
-        # expression splits tokens separated by semicolons and up to one
-        # leading or trailing space. Importantly, using the expand keyword
-        # ensures that all rows are padded to have n tokens. The final step is
-        # to replace those empty values with "Unspecified".
+        # expression splits on semicolons along with any whitespace around
+        # them. Importantly, using the expand keyword ensures that all rows
+        # are padded with missing values to have n tokens. The final step is
+        # to replace those missing values with "Unspecified".
         tax_levels = feature_metadata[tax_col_name]\
-            .str.strip().str.split(r'\s{0,1};\s{0,1}', expand=True)
+            .str.strip().str.split(r'\s*;\s*', expand=True)
         tax_levels.fillna('Unspecified', inplace=True)
 
         if len(tax_levels.columns) == 1:

From a82126cc1b8c93c5f7681671ee4d6403e92a3f4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= <yoshiki89@gmail.com>
Date: Thu, 13 Aug 2020 13:02:25 -0700
Subject: [PATCH 3/3] TST: Add a case with a variable number of spaces

---
 tests/python/test_taxonomy_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/python/test_taxonomy_utils.py b/tests/python/test_taxonomy_utils.py
index 11f5f5aa0..703a2d7c1 100644
--- a/tests/python/test_taxonomy_utils.py
+++ b/tests/python/test_taxonomy_utils.py
@@ -27,8 +27,10 @@ def setUp(self):
                         "c__Gammaproteobacteria; o__Pasteurellales; "
                         "f__Pasteurellaceae; g__; s__"
                     ),
+                    # add a variable number of whitespace characters to check
+                    # these are all successfully removed
                     (
-                        "k__Bacteria; p__Bacteroidetes; c__Bacteroidia; "
+                        "k__Bacteria;p__Bacteroidetes  ;     c__Bacteroidia; "
                         "o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; "
                         "s__uniformis"
                     ),