Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate pandas read_excel engine from xlrd to openpyxl. #325

Merged
merged 2 commits into from
Jun 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 52 additions & 9 deletions FlowCal/excel_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
try:
import openpyxl
except ImportError:
pass

import FlowCal.io
import FlowCal.plot
Expand All @@ -115,7 +119,7 @@ class ExcelUIException(Exception):
"""
pass

def read_table(filename, sheetname, index_col=None):
def read_table(filename, sheetname, index_col=None, engine=None):
"""
Return the contents of an Excel table as a pandas DataFrame.
Expand All @@ -128,6 +132,9 @@ def read_table(filename, sheetname, index_col=None):
index_col : str, optional
Column name or index to be used as row labels of the DataFrame. If
None, default index will be used.
engine : str, optional
Engine used by `pd.read_excel()` to read Excel file. If None, try
'openpyxl' then 'xlrd'.
Returns
-------
Expand All @@ -150,17 +157,53 @@ def read_table(filename, sheetname, index_col=None):
raise TypeError("sheetname should specify a single sheet")

# Load excel table using pandas
# Parameter specifying sheet name is slightly different depending on pandas'
# version.
read_excel_kwargs = {'io':filename,'index_col':index_col}

# Parameter specifying sheet name depends on pandas version
if packaging.version.parse(pd.__version__) \
< packaging.version.parse('0.21'):
table = pd.read_excel(filename,
sheetname=sheetname,
index_col=index_col)
read_excel_kwargs['sheetname'] = sheetname
else:
table = pd.read_excel(filename,
sheet_name=sheetname,
index_col=index_col)
read_excel_kwargs['sheet_name'] = sheetname

if engine is None:
# try reading Excel file using openpyxl engine first, then xlrd
try:
read_excel_kwargs['engine'] = 'openpyxl'
table = pd.read_excel(**read_excel_kwargs)
except ImportError as e:
if not('openpyxl' in str(e).lower()
and 'missing' in str(e).lower()):
raise
else:
# pandas recognizes openpyxl but package is missing, try xlrd
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
except ValueError as e:
if not('openpyxl' in str(e).lower()
and 'unknown' in str(e).lower()):
raise
else:
# pandas does not recognize openpyxl (e.g. pandas
# version < 0.25.0), try xlrd
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
except Exception as e:
if 'openpyxl' in sys.modules \
and isinstance(e, openpyxl.utils.exceptions \
.InvalidFileException):
# unsupported file type (e.g. .xls), try xlrd
#
# (note: openpyxl's InvalidFileException has been stable at
# that location since v2.2.0)
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
else:
raise
else:
read_excel_kwargs['engine'] = engine
table = pd.read_excel(**read_excel_kwargs)

# Eliminate rows whose index are null
if index_col is not None:
table = table[pd.notnull(table.index)]
Expand Down
3 changes: 2 additions & 1 deletion doc/getting_started/install_python.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Alternatively, download ``FlowCal`` from `here <https://github.com/taborlab/Flow
* ``scikit-learn`` (>=0.16.0)
* ``pandas`` (>=0.16.1)
* ``xlrd`` (>=0.9.2)
* ``openpyxl`` (>=2.4.1)
* ``XlsxWriter`` (>=0.5.2)

If you have ``pip``, a ``requirements.txt`` file is provided, such that the required packages can be installed by running::
Expand Down Expand Up @@ -52,4 +53,4 @@ Again, some users may need to precede the previous commands with ``sudo``.

sudo pip install --upgrade pip

After this, you may install ``FlowCal`` by following the steps above.
After this, you may install ``FlowCal`` by following the steps above.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ scikit-image>=0.10.0
scikit-learn>=0.16.0
pandas>=0.16.1
xlrd>=0.9.2
openpyxl>=2.4.1
XlsxWriter>=0.5.2
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def find_version(file_path):
'scikit-learn>=0.16.0',
'pandas>=0.16.1',
'xlrd>=0.9.2',
'openpyxl>=2.4.1',
'XlsxWriter>=0.5.2'],

# List additional groups of dependencies here (e.g. development
Expand Down
47 changes: 47 additions & 0 deletions test/test_excel_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,53 @@ def test_read_table(self):
# Compare
tm.assert_frame_equal(table, expected_output)

def test_read_table_xls(self):
    """
    Check that a table stored in an old-format (.xls) workbook loads
    correctly.

    The "Instruments" sheet of 'test/test_excel_ui.xls' is read with "ID"
    as the index column, and the result is compared against a DataFrame
    built by hand below.
    """
    # Columns expected in the loaded table, in order
    column_names = [u'Description',
                    u'Forward Scatter Channel',
                    u'Side Scatter Channel',
                    u'Fluorescence Channels',
                    u'Time Channel']

    # Expected cell contents, one tuple per row, ordered as column_names
    row_values = [
        (u'Moake\'s Flow Cytometer',
         u'FSC-H',
         u'SSC-H',
         u'FL1-H, FL2-H, FL3-H',
         u'Time'),
        (u'Moake\'s Flow Cytometer (new acquisition card)',
         u'FSC',
         u'SSC',
         u'FL1, FL2, FL3',
         u'TIME'),
    ]

    # Assemble the expected DataFrame from per-row dictionaries
    expected_rows = [dict(zip(column_names, values))
                     for values in row_values]
    expected = pd.DataFrame(
        expected_rows,
        index=pd.Series([u'FC001', u'FC002'], name='ID'),
        columns=column_names)

    # Load the table without specifying an engine
    loaded = FlowCal.excel_ui.read_table('test/test_excel_ui.xls',
                                         sheetname="Instruments",
                                         index_col="ID")

    # Loaded table should match the expected one exactly
    tm.assert_frame_equal(loaded, expected)

def test_read_table_no_index_col(self):
"""
Test proper loading of a table when no index column is specified.
Expand Down
Binary file added test/test_excel_ui.xls
Binary file not shown.