From cfcf01265b5225eea5b5f1b614960770fb865714 Mon Sep 17 00:00:00 2001 From: "John T. Sexton" Date: Tue, 5 May 2020 15:25:58 -0500 Subject: [PATCH] Migrate pandas read_excel engine from xlrd to openpyxl. -Modify FlowCal.excel_ui.read_table() to first try openpyxl engine when reading an Excel file and then xlrd (which is the only package that can read old-style XLS files). -Expose the pd.read_excel() `engine` parameter to the user from FlowCal.excel_ui.read_table(). -Add new unit test that reads old-style XLS file. -Modify requirements file to require openpyxl package. --- FlowCal/excel_ui.py | 61 +++++++++++++++++++++++++++++++++++------ requirements.txt | 1 + test/test_excel_ui.py | 47 +++++++++++++++++++++++++++++++ test/test_excel_ui.xls | Bin 0 -> 25088 bytes 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 test/test_excel_ui.xls diff --git a/FlowCal/excel_ui.py b/FlowCal/excel_ui.py index 3408760..fb01990 100644 --- a/FlowCal/excel_ui.py +++ b/FlowCal/excel_ui.py @@ -96,6 +96,10 @@ from matplotlib import pyplot as plt import numpy as np import pandas as pd +try: + import openpyxl +except ImportError: + pass import FlowCal.io import FlowCal.plot @@ -115,7 +119,7 @@ class ExcelUIException(Exception): """ pass -def read_table(filename, sheetname, index_col=None): +def read_table(filename, sheetname, index_col=None, engine=None): """ Return the contents of an Excel table as a pandas DataFrame. @@ -128,6 +132,9 @@ def read_table(filename, sheetname, index_col=None): index_col : str, optional Column name or index to be used as row labels of the DataFrame. If None, default index will be used. + engine : str, optional + Engine used by `pd.read_excel()` to read Excel file. If None, try + 'openpyxl' then 'xlrd'. Returns ------- @@ -150,17 +157,53 @@ def read_table(filename, sheetname, index_col=None): raise TypeError("sheetname should specify a single sheet") # Load excel table using pandas - # Parameter specifying sheet name is slightly different depending on pandas' - # version. + read_excel_kwargs = {'io':filename,'index_col':index_col} + + # Parameter specifying sheet name depends on pandas version if packaging.version.parse(pd.__version__) \ < packaging.version.parse('0.21'): - table = pd.read_excel(filename, - sheetname=sheetname, - index_col=index_col) + read_excel_kwargs['sheetname'] = sheetname else: - table = pd.read_excel(filename, - sheet_name=sheetname, - index_col=index_col) + read_excel_kwargs['sheet_name'] = sheetname + + if engine is None: + # try reading Excel file using openpyxl engine first, then xlrd + try: + read_excel_kwargs['engine'] = 'openpyxl' + table = pd.read_excel(**read_excel_kwargs) + except ImportError as e: + if not('openpyxl' in str(e).lower() + and 'missing' in str(e).lower()): + raise + else: + # pandas recognizes openpyxl but package is missing, try xlrd + read_excel_kwargs['engine'] = 'xlrd' + table = pd.read_excel(**read_excel_kwargs) + except ValueError as e: + if not('openpyxl' in str(e).lower() + and 'unknown' in str(e).lower()): + raise + else: + # pandas does not recognize openpyxl (e.g. pandas + # version <= 0.25.0), try xlrd + read_excel_kwargs['engine'] = 'xlrd' + table = pd.read_excel(**read_excel_kwargs) + except Exception as e: + if 'openpyxl' in sys.modules \ + and isinstance(e, openpyxl.utils.exceptions \ + .InvalidFileException): + # unsupported file type (e.g. .xls), try xlrd + # + # (note: openpyxl's InvalidFileException has been stable at + # that location since v2.2.0) + read_excel_kwargs['engine'] = 'xlrd' + table = pd.read_excel(**read_excel_kwargs) + else: + raise + else: + read_excel_kwargs['engine'] = engine + table = pd.read_excel(**read_excel_kwargs) + # Eliminate rows whose index are null if index_col is not None: table = table[pd.notnull(table.index)] diff --git a/requirements.txt b/requirements.txt index f23e2e7..b259aee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ scikit-image>=0.10.0 scikit-learn>=0.16.0 pandas>=0.16.1 xlrd>=0.9.2 +openpyxl>=2.4.1 XlsxWriter>=0.5.2 diff --git a/test/test_excel_ui.py b/test/test_excel_ui.py index aba240f..a242eac 100644 --- a/test/test_excel_ui.py +++ b/test/test_excel_ui.py @@ -67,6 +67,53 @@ def test_read_table(self): # Compare tm.assert_frame_equal(table, expected_output) + def test_read_table_xls(self): + """ + Test for proper loading of a table from an old-format Excel sheet. + + """ + xls_filename = 'test/test_excel_ui.xls' + + # Sheet to read + sheetname = "Instruments" + # Column to use as index labels + index_col = "ID" + + # Expected output + expected_output_list = [] + row = {} + row[u'Description'] = u'Moake\'s Flow Cytometer' + row[u'Forward Scatter Channel'] = u'FSC-H' + row[u'Side Scatter Channel'] = u'SSC-H' + row[u'Fluorescence Channels'] = u'FL1-H, FL2-H, FL3-H' + row[u'Time Channel'] = u'Time' + expected_output_list.append(row) + row = {} + row[u'Description'] = u'Moake\'s Flow Cytometer (new acquisition card)' + row[u'Forward Scatter Channel'] = u'FSC' + row[u'Side Scatter Channel'] = u'SSC' + row[u'Fluorescence Channels'] = u'FL1, FL2, FL3' + row[u'Time Channel'] = u'TIME' + expected_output_list.append(row) + expected_index = pd.Series([u'FC001', u'FC002'], name='ID') + expected_columns = [u'Description', + u'Forward Scatter Channel', + u'Side Scatter Channel', + u'Fluorescence Channels', + u'Time Channel'] + + expected_output = pd.DataFrame(expected_output_list, + index=expected_index, + columns=expected_columns) + + # Read table + table = FlowCal.excel_ui.read_table(xls_filename, + sheetname=sheetname, + index_col=index_col) + + # Compare + tm.assert_frame_equal(table, expected_output) + def test_read_table_no_index_col(self): """ Test proper loading of a table when no index column is specified. diff --git a/test/test_excel_ui.xls b/test/test_excel_ui.xls new file mode 100644 index 0000000000000000000000000000000000000000..8169fc33955a84f675e9f6b4ef86e50b6dfcfcf8 GIT binary patch literal 25088 zcmeHPeQX@Zb$@$zB#-(csc(NsT9Nu7DN>Q{WLb)2ohaI}lu(usDUjPJl~2c`MTp`l z$D<_y7ECAcKe3xSR4TWYS|?46pv|X0TBnFx%RpiOLpMbm*Ri7{PT?8}QZ%vBqD^4u z^83x)-tF$~?%hR35Cgl!?C!js`Mo!9-g`SUJI8#UiuTbv^Du3Xk`Y23q6j$mS;b!~HL{JTDPxj*I*V*Kfp_6ZsFMBXWxO z=I`>@rDRAOl;JWA?-rw342emTT*8wMxlTzpU!51I^Ve0`XVv)@nJ+(=QF(dm2UcuB zJwduNvQw4&V|DIS=Xp3A@{FqUnxbr!<#Gk(it$uY^ING!ANVko+)YkSEkyh}!8cox6_dQC@-l=R8_oy>F#ocxq%HkPSWh3RHl z2S2GWwd$AoT9p;+O3l}0A-qE5e~YHN>eFR2q*D{`z8(xQwTgL;Xnc-dl>VPCJ&HZ1 z^j}T?st^5deCS{EphyRoh&R_b_86|7=IcBeNfwEwKx#d`Fo+GVqpx^#+nx2yKsBGg)_&Zm*xC&NpGd{mpVEY z&t42YKdR{Stn$Fude-`H#(UXI&n;GYmz)bN`gbtIlO1I}%(v*S^3=mX)Wo@TAkN$y zUnuudq2(hbgD;-^z`1z8oO94wAR=?|UWnq3!%MzA@!IK~hc|Vkx&n(dR6!3F8{j}e z8sZRZv;tMu5C_KQhAOag8>+xgX{Z9-qM-`(yM`(-&@@zm9@9_-x@1EY=wl64U{GwR z!X`%rnqm!q*wJW(b&d*kcwAqj75W>kz(taVw1nYnV->bJD!A2ZoQn^*;u1!;h)7n{ za94=LO96|nuSephSq*|AeQl6pDi|bL4@e>z46>mN!f=SdUUZmM49($#7k>87MgQh> zJ3uZ;vo}cb#UPNIogjyOBq{#%l1-A&xO(R%fV8OIq136=JwB4m{COFqrPRdx?z^vO zE!wab31FwS+Gu#Bj#(bGb8)v&TT#|7RLWbZt5>g936)Bvs)VxBETKp%$t0Fgc3QPi zm>SxI`qeofb$bbA*UGhN%I#d6A(VD5YQIV-i&=5XC~K;^WxeH=(K~r~n)Kt3KVAm0 zY{0Hsi6%km;`z z&dTc0NiYZ&(rSau3Sz1cO{x56H|=FbF21wIxX;gF)6fK#=j)31>t8I^hht z4xI#pV4hzaWM(KB#Hi|KOs*s$8{HwQAA?08}0bbOyoQ+1`oh%GP9#WMW{SuLjYBMnFfe_0uq0++*~ z07K?TODsEb<;}N>O}yvZ2BU!0T+ASw4mX=bHJi==HhM~-*feEFu3q_`H=FrxHpyx> zYXaElNrz(7oE`b;TR-q-v%t+}Q#G6305+WMmXy|#9eLx2|K-hQp_|Q3)oj)Wu;Fx} z#3r5{dF{V{;mu}|o6Y8GHi-Z>dRVKJ)|wr8=O?dvvsp|wn}RKE1=wt=!_sDkzB%)T zH=8AHHXfGN6~M-BX){A_e)Ab`HcQ=XJS=T(02{ld%?y3_jc2{tT<2!vVQK3E*w`&? zW@zT}Yu;?GceC-ZwD$$Dv0K{A&@(^yu{WC=+-y87Eg8VZZfP?^Z@>JsH=7&DW^=Hm znP9WI4ogcU|MaO>yxA;sv+=OBRRL`5mX=7qcHy($Y?iy(cvxC@02{ldC6fR2%0+KB zE8J{6EUhnqjos1`$tOPdqBk4U&BnvhHUzM-TUsJH{n8b0HY?q1JS=Tf02{ldC6d2- z?oYhf%)xhVjJwM3%>mbYb6K&)_gyKn|Nei_pnOr{D7`D^NLK|E7Uc$A=>oM|5x$`u z*yjzpssal8af7aQf!b|GL^#i~Qe(-Cm$^>BietcBO}><6E6QZ`@Ff$}TK74wv7+ zOv-|qyBEH2I#z4s_1lh}cj#?Wh2 zau|Yf9fj{+f^s>_V!-7k#h_X6NeK_Ro?)P{Jr{T6S{=Fbkv444z~_s>6|5%&RJ6F-1$gM z)=fo=({a-&N>da}2BDIZ_)CM82+Q_OHKS~|t86p2Ggiu)@)5iVK3cBRKNUlrHLg0b zVjOiQvGFUf#=aRjTduKwss%OHyK1x)=Ty~j@>+yVlvt*@?aZmM^mw|E%bzt5WzQ6* zlq=1A%D!qY(6N2&m@> z$^_srEB2n#a5TjhNTvhf`N%v>GHl?8eQxj$4{#m-e87vTFFnUGJi(K8aKu12_)Ks3$JhpBTJn{h9^&rjb_q?>~PPc@ul*pO@8eH`b*3gun+tTwr50o z2rgmZ?|CDRB$D~QA#clwu?x?*9FEUEdv8UsxHUVG$&a4mAO_^!x%}yLe%L$+riE)(aa2_>Ve(vJ~-IF2iJqV!s*F>0mwre`}g#j zyN43`lGG3P?z;_v8g+0+4NNjoP!;2eOZB@4H*DBgx+Y53B$j~>LP9fFI1U|*Q09y! zGs}SY0Irl_pu~JYGgp^n|3DGjqvEn1`vssQ`8lqaaovgURqgDH_+lgwN(U6g0ERxf zfS$Zyd$Q|44Y>%~MH`Jh{}5;o*=Pjfz7_q8fOf|(0%k|heWe3>kZ!!)zl)dfT<~Kw zjKJ30)O`oekPwo;{nu~(>%LA@2ID{?b4CE-QeilKFD%H6ta$J7axfpT?7^PejaGwbbzsBrVkG`D3FwDOGz++c`x&L_kmp>Dib!(q`l@rrHL*lrM&vzox zzxN<P#m1}bsakB$x1_Z(U78_nc%6SE z;vBw!#CiX{NL&f#Isn%L9zx>!!1s~7)&umMzFlFVi0^edn6NCdY@fyP1>BLhe9`$>-}w1WpL2cg(d2ZP^K8!Fx%ZOue$Mf^W0T+f zInU=#Fz)~2T%YR!oQHG1&42&Ee|NzD2*dw?!v7b)0#_4hB~m9+7t$)E)ku847T0d1 z9;9BRbx3_k{YZTN4ic~7acvs-5Pp@wA6l6Br5XRV1AkrC$gxW-X z$wW%!_m;YJ^|>4T=|l9NdiisCNjVQL*!vA`{C($1{9=SX(3m`1ev9_6vGM;6F}|&h literal 0 HcmV?d00001