Skip to content

Commit

Permalink
Merge branch 'excel-engine' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
castillohair committed Jun 15, 2020
2 parents 1f4ca32 + ed91526 commit 9af6468
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 9 deletions.
61 changes: 52 additions & 9 deletions FlowCal/excel_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
try:
import openpyxl
except ImportError:
pass

import FlowCal.io
import FlowCal.plot
Expand All @@ -115,7 +119,7 @@ class ExcelUIException(Exception):
"""
pass

def read_table(filename, sheetname, index_col=None):
def read_table(filename, sheetname, index_col=None, engine=None):
"""
Return the contents of an Excel table as a pandas DataFrame.
Expand All @@ -128,6 +132,9 @@ def read_table(filename, sheetname, index_col=None):
index_col : str, optional
Column name or index to be used as row labels of the DataFrame. If
None, default index will be used.
engine : str, optional
Engine used by `pd.read_excel()` to read Excel file. If None, try
'openpyxl' then 'xlrd'.
Returns
-------
Expand All @@ -150,17 +157,53 @@ def read_table(filename, sheetname, index_col=None):
raise TypeError("sheetname should specify a single sheet")

# Load excel table using pandas
# Parameter specifying sheet name is slightly different depending on pandas'
# version.
read_excel_kwargs = {'io':filename,'index_col':index_col}

# Parameter specifying sheet name depends on pandas version
if packaging.version.parse(pd.__version__) \
< packaging.version.parse('0.21'):
table = pd.read_excel(filename,
sheetname=sheetname,
index_col=index_col)
read_excel_kwargs['sheetname'] = sheetname
else:
table = pd.read_excel(filename,
sheet_name=sheetname,
index_col=index_col)
read_excel_kwargs['sheet_name'] = sheetname

if engine is None:
# try reading Excel file using openpyxl engine first, then xlrd
try:
read_excel_kwargs['engine'] = 'openpyxl'
table = pd.read_excel(**read_excel_kwargs)
except ImportError as e:
if not('openpyxl' in str(e).lower()
and 'missing' in str(e).lower()):
raise
else:
# pandas recognizes openpyxl but package is missing, try xlrd
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
except ValueError as e:
if not('openpyxl' in str(e).lower()
and 'unknown' in str(e).lower()):
raise
else:
# pandas does not recognize openpyxl (e.g. pandas
# version <= 0.25.0), try xlrd
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
except Exception as e:
if 'openpyxl' in sys.modules \
and isinstance(e, openpyxl.utils.exceptions \
.InvalidFileException):
# unsupported file type (e.g. .xls), try xlrd
#
# (note: openpyxl's InvalidFileException has been stable at
# that location since v2.2.0)
read_excel_kwargs['engine'] = 'xlrd'
table = pd.read_excel(**read_excel_kwargs)
else:
raise
else:
read_excel_kwargs['engine'] = engine
table = pd.read_excel(**read_excel_kwargs)

# Eliminate rows whose index is null
if index_col is not None:
table = table[pd.notnull(table.index)]
Expand Down
1 change: 1 addition & 0 deletions doc/getting_started/install_python.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Alternatively, download ``FlowCal`` from `here <https://github.com/taborlab/Flow
* ``scikit-learn`` (>=0.16.0)
* ``pandas`` (>=0.16.1)
* ``xlrd`` (>=0.9.2)
* ``openpyxl`` (>=2.4.1)
* ``XlsxWriter`` (>=0.5.2)

If you have ``pip``, a ``requirements.txt`` file is provided, such that the required packages can be installed by running::
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ scikit-image>=0.10.0
scikit-learn>=0.16.0
pandas>=0.16.1
xlrd>=0.9.2
openpyxl>=2.4.1
XlsxWriter>=0.5.2
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def find_version(file_path):
'scikit-learn>=0.16.0',
'pandas>=0.16.1',
'xlrd>=0.9.2',
'openpyxl>=2.4.1',
'XlsxWriter>=0.5.2'],

# List additional groups of dependencies here (e.g. development
Expand Down
47 changes: 47 additions & 0 deletions test/test_excel_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,53 @@ def test_read_table(self):
# Compare
tm.assert_frame_equal(table, expected_output)

def test_read_table_xls(self):
    """
    Check that a table stored in an old-format (.xls) workbook loads
    properly.

    The "Instruments" sheet of ``test/test_excel_ui.xls`` is read using
    the "ID" column as row labels and without specifying an engine. The
    result is compared against a DataFrame built by hand.

    """
    # Reference data: one dictionary per table row. Column order is fixed
    # below through an explicit list of column names.
    reference_columns = [u'Description',
                         u'Forward Scatter Channel',
                         u'Side Scatter Channel',
                         u'Fluorescence Channels',
                         u'Time Channel']
    reference_rows = [
        {u'Description': u'Moake\'s Flow Cytometer',
         u'Forward Scatter Channel': u'FSC-H',
         u'Side Scatter Channel': u'SSC-H',
         u'Fluorescence Channels': u'FL1-H, FL2-H, FL3-H',
         u'Time Channel': u'Time'},
        {u'Description': u'Moake\'s Flow Cytometer (new acquisition card)',
         u'Forward Scatter Channel': u'FSC',
         u'Side Scatter Channel': u'SSC',
         u'Fluorescence Channels': u'FL1, FL2, FL3',
         u'Time Channel': u'TIME'},
    ]
    reference_index = pd.Series([u'FC001', u'FC002'], name='ID')
    reference = pd.DataFrame(reference_rows,
                             index=reference_index,
                             columns=reference_columns)

    # Load the table through the function under test
    loaded = FlowCal.excel_ui.read_table('test/test_excel_ui.xls',
                                         sheetname="Instruments",
                                         index_col="ID")

    # Loaded table should match the reference exactly
    tm.assert_frame_equal(loaded, reference)

def test_read_table_no_index_col(self):
"""
Test proper loading of a table when no index column is specified.
Expand Down
Binary file added test/test_excel_ui.xls
Binary file not shown.

0 comments on commit 9af6468

Please sign in to comment.