Merge branch 'excel-engine' into develop

taborlab · Jun 15, 2020 · 9af6468 · 9af6468
2 parents 1f4ca32 + ed91526
commit 9af6468
Show file tree

Hide file tree

Showing 6 changed files with 102 additions and 9 deletions.
diff --git a/FlowCal/excel_ui.py b/FlowCal/excel_ui.py
@@ -96,6 +96,10 @@
 from matplotlib import pyplot as plt
 import numpy as np
 import pandas as pd
+try:
+    import openpyxl
+except ImportError:
+    pass
 
 import FlowCal.io
 import FlowCal.plot
@@ -115,7 +119,7 @@ class ExcelUIException(Exception):
     """
     pass
 
-def read_table(filename, sheetname, index_col=None):
+def read_table(filename, sheetname, index_col=None, engine=None):
     """
     Return the contents of an Excel table as a pandas DataFrame.
 
@@ -128,6 +132,9 @@ def read_table(filename, sheetname, index_col=None):
     index_col : str, optional
         Column name or index to be used as row labels of the DataFrame. If
         None, default index will be used.
+    engine : str, optional
+        Engine used by `pd.read_excel()` to read the Excel file. If None,
+        try 'openpyxl' first and fall back to 'xlrd'.
 
     Returns
     -------
@@ -150,17 +157,53 @@ def read_table(filename, sheetname, index_col=None):
         raise TypeError("sheetname should specify a single sheet")
 
     # Load excel table using pandas
-    # Parameter specifying sheet name is slightly different depending on pandas'
-    # version.
+    read_excel_kwargs = {'io':filename,'index_col':index_col}
+
+    # Parameter specifying sheet name depends on pandas version
     if packaging.version.parse(pd.__version__) \
                 < packaging.version.parse('0.21'):
-        table = pd.read_excel(filename,
-                              sheetname=sheetname,
-                              index_col=index_col)
+        read_excel_kwargs['sheetname']  = sheetname
     else:
-        table = pd.read_excel(filename,
-                              sheet_name=sheetname,
-                              index_col=index_col)
+        read_excel_kwargs['sheet_name'] = sheetname
+
+    if engine is None:
+        # try reading Excel file using openpyxl engine first, then xlrd
+        try:
+            read_excel_kwargs['engine'] = 'openpyxl'
+            table = pd.read_excel(**read_excel_kwargs)
+        except ImportError as e:
+            if not('openpyxl' in str(e).lower()
+                   and 'missing' in str(e).lower()):
+                raise
+            else:
+                # pandas recognizes openpyxl but package is missing, try xlrd
+                read_excel_kwargs['engine'] = 'xlrd'
+                table = pd.read_excel(**read_excel_kwargs)
+        except ValueError as e:
+            if not('openpyxl' in str(e).lower()
+                   and 'unknown' in str(e).lower()):
+                raise
+            else:
+                # pandas does not recognize openpyxl (e.g. pandas
+                # version < 0.25.0), try xlrd
+                read_excel_kwargs['engine'] = 'xlrd'
+                table = pd.read_excel(**read_excel_kwargs)
+        except Exception as e:
+            if 'openpyxl' in sys.modules \
+                    and isinstance(e, openpyxl.utils.exceptions \
+                                          .InvalidFileException):
+                # unsupported file type (e.g. .xls), try xlrd
+                #
+                # (note: openpyxl's InvalidFileException has been stable at
+                # that location since v2.2.0)
+                read_excel_kwargs['engine'] = 'xlrd'
+                table = pd.read_excel(**read_excel_kwargs)
+            else:
+                raise
+    else:
+        read_excel_kwargs['engine'] = engine
+        table = pd.read_excel(**read_excel_kwargs)
+
     # Eliminate rows whose index are null
     if index_col is not None:
         table = table[pd.notnull(table.index)]

diff --git a/doc/getting_started/install_python.rst b/doc/getting_started/install_python.rst
@@ -18,6 +18,7 @@ Alternatively, download ``FlowCal`` from `here <https://github.com/taborlab/Flow
 * ``scikit-learn`` (>=0.16.0)
 * ``pandas`` (>=0.16.1)
 * ``xlrd`` (>=0.9.2)
+* ``openpyxl`` (>=2.4.1)
 * ``XlsxWriter`` (>=0.5.2)
 
 If you have ``pip``, a ``requirements.txt`` file is provided, such that the required packages can be installed by running::

diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,5 @@ scikit-image>=0.10.0
 scikit-learn>=0.16.0
 pandas>=0.16.1
 xlrd>=0.9.2
+openpyxl>=2.4.1
 XlsxWriter>=0.5.2
diff --git a/setup.py b/setup.py
@@ -94,6 +94,7 @@ def find_version(file_path):
                       'scikit-learn>=0.16.0',
                       'pandas>=0.16.1',
                       'xlrd>=0.9.2',
+                      'openpyxl>=2.4.1',
                       'XlsxWriter>=0.5.2'],
 
     # List additional groups of dependencies here (e.g. development

diff --git a/test/test_excel_ui.py b/test/test_excel_ui.py
@@ -67,6 +67,53 @@ def test_read_table(self):
         # Compare
         tm.assert_frame_equal(table, expected_output)
 
+    def test_read_table_xls(self):
+        """
+        Test for proper loading of a table from an old-format (.xls) file.
+
+        """
+        xls_filename = 'test/test_excel_ui.xls'
+
+        # Sheet to read
+        sheetname = "Instruments"
+        # Column to use as index labels
+        index_col = "ID"
+
+        # Expected output
+        expected_output_list = []
+        row = {}
+        row[u'Description'] = u'Moake\'s Flow Cytometer'
+        row[u'Forward Scatter Channel'] = u'FSC-H'
+        row[u'Side Scatter Channel'] = u'SSC-H'
+        row[u'Fluorescence Channels'] = u'FL1-H, FL2-H, FL3-H'
+        row[u'Time Channel'] = u'Time'
+        expected_output_list.append(row)
+        row = {}
+        row[u'Description'] = u'Moake\'s Flow Cytometer (new acquisition card)'
+        row[u'Forward Scatter Channel'] = u'FSC'
+        row[u'Side Scatter Channel'] = u'SSC'
+        row[u'Fluorescence Channels'] = u'FL1, FL2, FL3'
+        row[u'Time Channel'] = u'TIME'
+        expected_output_list.append(row)
+        expected_index = pd.Series([u'FC001', u'FC002'], name='ID')
+        expected_columns = [u'Description',
+                            u'Forward Scatter Channel',
+                            u'Side Scatter Channel',
+                            u'Fluorescence Channels',
+                            u'Time Channel']
+
+        expected_output = pd.DataFrame(expected_output_list,
+                                       index=expected_index,
+                                       columns=expected_columns)
+
+        # Read table
+        table = FlowCal.excel_ui.read_table(xls_filename,
+                                            sheetname=sheetname,
+                                            index_col=index_col)
+
+        # Compare
+        tm.assert_frame_equal(table, expected_output)
+
     def test_read_table_no_index_col(self):
         """
         Test proper loading of a table when no index column is specified.

diff --git a/test/test_excel_ui.xls b/test/test_excel_ui.xls