From 635015782dd00b5947923a98d97545d7f3f69a55 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 8 Jun 2019 15:35:29 -0400 Subject: [PATCH 1/4] Separated out ExcelFile tests --- pandas/tests/io/test_excel.py | 235 +++++++++++++++++++--------------- 1 file changed, 130 insertions(+), 105 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 6860afa710c4a..a85ee8a00d079 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -56,6 +56,16 @@ def ignore_xlrd_time_clock_warning(): yield +@pytest.fixture +def df_ref(): + """ + Obtain the reference data from read_csv with the Python engine. + """ + df_ref = read_csv('test1.csv', index_col=0, + parse_dates=True, engine='python') + return df_ref + + @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestReaders: @@ -74,15 +84,6 @@ def cd_and_set_engine(self, request, datapath, monkeypatch): monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'read_excel', func) - @pytest.fixture - def df_ref(self): - """ - Obtain the reference data from read_csv with the Python engine. - """ - df_ref = read_csv('test1.csv', index_col=0, - parse_dates=True, engine='python') - return df_ref - def test_usecols_int(self, ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) @@ -239,75 +240,6 @@ def test_excel_cell_error_na(self, ext): expected = DataFrame([[np.nan]], columns=['Test']) tm.assert_frame_equal(parsed, expected) - def test_excel_passes_na(self, ext): - - excel = ExcelFile('test4' + ext) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - # 13967 - excel = ExcelFile('test5' + ext) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) - def test_unexpected_kwargs_raises(self, ext, arg): - # gh-17964 - excel = ExcelFile('test1' + ext) - - kwarg = {arg: 'Sheet1'} - msg = "unexpected keyword argument `{}`".format(arg) - with pytest.raises(TypeError, match=msg): - pd.read_excel(excel, **kwarg) - - def test_excel_table_sheet_by_index(self, ext, df_ref): - - excel = ExcelFile('test1' + ext) - - df1 = pd.read_excel(excel, 0, index_col=0) - df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - - df1 = excel.parse(0, index_col=0) - df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - - df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df4) - - df3 = excel.parse(0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - import xlrd - with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, 'asdf') - def test_excel_table(self, ext, df_ref): df1 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0) @@ -497,12 +429,7 @@ def test_date_conversion_overflow(self, ext): result = pd.read_excel('testdateoverflow' + ext) tm.assert_frame_equal(result, expected) - def test_sheet_name_and_sheetname(self, ext, df_ref): - # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" - # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) - # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name - # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument + def test_sheet_name(self, ext, df_ref): filename = "test1" sheet_name = "Sheet1" @@ -512,15 +439,8 @@ def test_sheet_name_and_sheetname(self, ext, df_ref): df2 = pd.read_excel(filename + ext, index_col=0, sheet_name=sheet_name) - excel = ExcelFile(filename + ext) - df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - df2_parse = excel.parse(index_col=0, - sheet_name=sheet_name) - tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_excel_read_buffer(self, ext): @@ -530,11 +450,6 @@ def test_excel_read_buffer(self, ext): actual = pd.read_excel(f, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - with open(pth, 'rb') as f: - xls = ExcelFile(f) - actual = pd.read_excel(xls, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) - def test_bad_engine_raises(self, ext): bad_engine = 'foo' with pytest.raises(ValueError, match="Unknown engine: foo"): @@ -606,15 +521,6 @@ def test_read_from_py_localpath(self, ext): tm.assert_frame_equal(expected, actual) - def test_reader_closes_file(self, ext): - - f = open('test1' + ext, 'rb') - with ExcelFile(f) as xlsx: - # parses okay - pd.read_excel(xlsx, 'Sheet1', index_col=0) - - assert f.closed - def test_reader_seconds(self, ext): # Test reading times with and without milliseconds. GH5945. @@ -841,6 +747,125 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestExcelFileRead: + + @pytest.fixture(autouse=True, params=[ + # Add any engines to test here + pytest.param('xlrd', marks=pytest.mark.skipif( + not td.safe_import("xlrd"), reason="no xlrd")), + pytest.param(None, marks=pytest.mark.skipif( + not td.safe_import("xlrd"), reason="no xlrd")), + ]) + def cd_and_set_engine(self, request, datapath, monkeypatch): + """ + Change directory and set engine for ExcelFile objects. + """ + func = partial(pd.ExcelFile, engine=request.param) + monkeypatch.chdir(datapath("io", "data")) + monkeypatch.setattr(pd, 'ExcelFile', func) + + def test_excel_passes_na(self, ext): + + excel = ExcelFile('test4' + ext) + + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + # 13967 + excel = ExcelFile('test5' + ext) + + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) + def test_unexpected_kwargs_raises(self, ext, arg): + # gh-17964 + excel = ExcelFile('test1' + ext) + + kwarg = {arg: 'Sheet1'} + msg = "unexpected keyword argument `{}`".format(arg) + with pytest.raises(TypeError, match=msg): + pd.read_excel(excel, **kwarg) + + def test_excel_table_sheet_by_index(self, ext, df_ref): + + excel = ExcelFile('test1' + ext) + + df1 = pd.read_excel(excel, 0, index_col=0) + df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) + tm.assert_frame_equal(df3, df4) + + df3 = excel.parse(0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + import xlrd + with pytest.raises(xlrd.XLRDError): + pd.read_excel(excel, 'asdf') + + def test_sheet_name(self, ext, df_ref): + filename = "test1" + sheet_name = "Sheet1" + + excel = ExcelFile(filename + ext) + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + df2_parse = excel.parse(index_col=0, + sheet_name=sheet_name) + + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + + def test_excel_read_buffer(self, ext): + + pth = 'test1' + ext + expected = pd.read_excel(pth, 'Sheet1', index_col=0) + + with open(pth, 'rb') as f: + xls = ExcelFile(f) + actual = pd.read_excel(xls, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) + + def test_reader_closes_file(self, ext): + + f = open('test1' + ext, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + pd.read_excel(xlsx, 'Sheet1', index_col=0) + + assert f.closed + + @td.skip_if_no('xlrd') @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestRoundTrip: From 0e34ab8bcc24d0d6a9555dda731b7957c76a124a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 8 Jun 2019 15:46:34 -0400 Subject: [PATCH 2/4] Added comment for xlrd test --- pandas/tests/io/test_excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index a85ee8a00d079..b8bd0ebefa6db 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -830,7 +830,7 @@ def test_excel_table_sheet_by_index(self, ext, df_ref): df3 = excel.parse(0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - import xlrd + import xlrd # will move to engine-specific tests as new ones are added with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, 'asdf') From b31f6166a1d57af112518d2280a582e47bd98cbc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 8 Jun 2019 15:52:20 -0400 Subject: [PATCH 3/4] Common fixture for read_ext --- pandas/tests/io/test_excel.py | 271 +++++++++++++++++----------------- 1 file changed, 138 insertions(+), 133 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index b8bd0ebefa6db..5dd5cee261e93 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -66,7 +66,14 @@ def df_ref(): return df_ref -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +@pytest.fixture(params=['.xls', '.xlsx', '.xlsm']) +def read_ext(request): + """ + Valid extensions for reading Excel files. + """ + return request.param + + class TestReaders: @pytest.fixture(autouse=True, params=[ @@ -84,45 +91,45 @@ def cd_and_set_engine(self, request, datapath, monkeypatch): monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'read_excel', func) - def test_usecols_int(self, ext, df_ref): + def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel("test1" + ext, "Sheet1", + df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel("test1" + ext, "Sheet2", skiprows=[1], + df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_list(self, ext, df_ref): + def test_usecols_list(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=['B', 'C']) - df1 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, usecols=[0, 2, 3]) - df2 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + df2 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], index_col=0, usecols=[0, 2, 3]) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, ext, df_ref): + def test_usecols_str(self, read_ext, df_ref): df1 = df_ref.reindex(columns=['A', 'B', 'C']) - df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, usecols='A:D') - df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], index_col=0, usecols='A:D') # TODO add index to xls, read xls ignores index name ? @@ -130,18 +137,18 @@ def test_usecols_str(self, ext, df_ref): tm.assert_frame_equal(df3, df1, check_names=False) df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, usecols='A,C,D') - df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], index_col=0, usecols='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, usecols='A,C:D') - df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], index_col=0, usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) @@ -152,9 +159,9 @@ def test_usecols_str(self, ext, df_ref): [3, 0, 1], [3, 1, 0], ]) def test_usecols_diff_positional_int_columns_order( - self, ext, usecols, df_ref): + self, read_ext, usecols, df_ref): expected = df_ref[["A", "C"]] - result = pd.read_excel("test1" + ext, "Sheet1", + result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) @@ -162,40 +169,40 @@ def test_usecols_diff_positional_int_columns_order( ["B", "D"], ["D", "B"] ]) def test_usecols_diff_positional_str_columns_order( - self, ext, usecols, df_ref): + self, read_ext, usecols, df_ref): expected = df_ref[["B", "D"]] expected.index = range(len(expected)) - result = pd.read_excel("test1" + ext, "Sheet1", usecols=usecols) + result = pd.read_excel("test1" + read_ext, "Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, ext, df_ref): + def test_read_excel_without_slicing(self, read_ext, df_ref): expected = df_ref - result = pd.read_excel("test1" + ext, "Sheet1", index_col=0) + result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, ext, df_ref): + def test_usecols_excel_range_str(self, read_ext, df_ref): expected = df_ref[["C", "D"]] - result = pd.read_excel("test1" + ext, "Sheet1", + result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E") tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str_invalid(self, ext): + def test_usecols_excel_range_str_invalid(self, read_ext): msg = "Invalid column name: E1" with pytest.raises(ValueError, match=msg): - pd.read_excel("test1" + ext, "Sheet1", usecols="D:E1") + pd.read_excel("test1" + read_ext, "Sheet1", usecols="D:E1") - def test_index_col_label_error(self, ext): + def test_index_col_label_error(self, read_ext): msg = "list indices must be integers.*, not str" with pytest.raises(TypeError, match=msg): - pd.read_excel("test1" + ext, "Sheet1", index_col=["A"], + pd.read_excel("test1" + read_ext, "Sheet1", index_col=["A"], usecols=["A", "C"]) - def test_index_col_empty(self, ext): + def test_index_col_empty(self, read_ext): # see gh-9208 - result = pd.read_excel("test1" + ext, "Sheet3", + result = pd.read_excel("test1" + read_ext, "Sheet3", index_col=["A", "B", "C"]) expected = DataFrame(columns=["D", "E", "F"], index=MultiIndex(levels=[[]] * 3, @@ -204,9 +211,9 @@ def test_index_col_empty(self, ext): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("index_col", [None, 2]) - def test_index_col_with_unnamed(self, ext, index_col): + def test_index_col_with_unnamed(self, read_ext, index_col): # see gh-18792 - result = pd.read_excel("test1" + ext, "Sheet4", index_col=index_col) + result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col) expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]) if index_col: @@ -214,45 +221,45 @@ def test_index_col_with_unnamed(self, ext, index_col): tm.assert_frame_equal(result, expected) - def test_usecols_pass_non_existent_column(self, ext): + def test_usecols_pass_non_existent_column(self, read_ext): msg = ("Usecols do not match columns, " "columns expected but not found: " + r"\['E'\]") with pytest.raises(ValueError, match=msg): - pd.read_excel("test1" + ext, usecols=["E"]) + pd.read_excel("test1" + read_ext, usecols=["E"]) - def test_usecols_wrong_type(self, ext): + def test_usecols_wrong_type(self, read_ext): msg = ("'usecols' must either be list-like of " "all strings, all unicode, all integers or a callable.") with pytest.raises(ValueError, match=msg): - pd.read_excel("test1" + ext, usecols=["E1", 0]) + pd.read_excel("test1" + read_ext, usecols=["E1", 0]) - def test_excel_stop_iterator(self, ext): + def test_excel_stop_iterator(self, read_ext): - parsed = pd.read_excel('test2' + ext, 'Sheet1') + parsed = pd.read_excel('test2' + read_ext, 'Sheet1') expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, ext): + def test_excel_cell_error_na(self, read_ext): - parsed = pd.read_excel('test3' + ext, 'Sheet1') + parsed = pd.read_excel('test3' + read_ext, 'Sheet1') expected = DataFrame([[np.nan]], columns=['Test']) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, ext, df_ref): + def test_excel_table(self, read_ext, df_ref): - df1 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0) - df2 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0) + df2 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], index_col=0) # TODO add index to file tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, skipfooter=1) + df3 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, ext): + def test_reader_special_dtypes(self, read_ext): expected = DataFrame.from_dict(OrderedDict([ ("IntCol", [1, 2, -3, 4, 0]), @@ -268,36 +275,36 @@ def test_reader_special_dtypes(self, ext): basename = 'test_types' # should read in correctly and infer types - actual = pd.read_excel(basename + ext, 'Sheet1') + actual = pd.read_excel(basename + read_ext, 'Sheet1') tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = pd.read_excel(basename + ext, 'Sheet1', convert_float=False) + actual = pd.read_excel(basename + read_ext, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = pd.read_excel(basename + ext, 'Sheet1', index_col=icol) + actual = pd.read_excel(basename + read_ext, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = pd.read_excel(basename + ext, 'Sheet1', + actual = pd.read_excel(basename + read_ext, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = pd.read_excel(basename + ext, 'Sheet1', convert_float=False, + actual = pd.read_excel(basename + read_ext, 'Sheet1', convert_float=False, converters={"StrCol": str}) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values - def test_reader_converters(self, ext): + def test_reader_converters(self, read_ext): basename = 'test_converters' @@ -316,13 +323,13 @@ def test_reader_converters(self, ext): # should read in correctly and set types of single cells (not array # dtypes) - actual = pd.read_excel(basename + ext, 'Sheet1', converters=converters) + actual = pd.read_excel(basename + read_ext, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) - def test_reader_dtype(self, ext): + def test_reader_dtype(self, read_ext): # GH 8212 basename = 'testdtype' - actual = pd.read_excel(basename + ext) + actual = pd.read_excel(basename + read_ext) expected = DataFrame({ 'a': [1, 2, 3, 4], @@ -333,7 +340,7 @@ def test_reader_dtype(self, ext): tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(basename + ext, + actual = pd.read_excel(basename + read_ext, dtype={'a': 'float64', 'b': 'float32', 'c': str}) @@ -344,7 +351,7 @@ def test_reader_dtype(self, ext): tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - pd.read_excel(basename + ext, dtype={'d': 'int64'}) + pd.read_excel(basename + read_ext, dtype={'d': 'int64'}) @pytest.mark.parametrize("dtype,expected", [ (None, @@ -366,19 +373,19 @@ def test_reader_dtype(self, ext): "d": ["1", "2", np.nan, "4"] })), ]) - def test_reader_dtype_str(self, ext, dtype, expected): + def test_reader_dtype_str(self, read_ext, dtype, expected): # see gh-20377 basename = "testdtype" - actual = pd.read_excel(basename + ext, dtype=dtype) + actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_reading_all_sheets(self, ext): + def test_reading_all_sheets(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. # See PR #9450 basename = 'test_multisheet' - dfs = pd.read_excel(basename + ext, sheet_name=None) + dfs = pd.read_excel(basename + read_ext, sheet_name=None) # ensure this is not alphabetical to test order preservation expected_keys = ['Charlie', 'Alpha', 'Beta'] tm.assert_contains_all(expected_keys, dfs.keys()) @@ -386,7 +393,7 @@ def test_reading_all_sheets(self, ext): # Ensure sheet order is preserved assert expected_keys == list(dfs.keys()) - def test_reading_multiple_specific_sheets(self, ext): + def test_reading_multiple_specific_sheets(self, read_ext): # Test reading specific sheetnames by specifying a mixed list # of integers and strings, and confirm that duplicated sheet # references (positions/names) are removed properly. @@ -395,93 +402,93 @@ def test_reading_multiple_specific_sheets(self, ext): basename = 'test_multisheet' # Explicitly request duplicates. Only the set should be returned. expected_keys = [2, 'Charlie', 'Charlie'] - dfs = pd.read_excel(basename + ext, sheet_name=expected_keys) + dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) expected_keys = list(set(expected_keys)) tm.assert_contains_all(expected_keys, dfs.keys()) assert len(expected_keys) == len(dfs.keys()) - def test_reading_all_sheets_with_blank(self, ext): + def test_reading_all_sheets_with_blank(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # In the case where some sheets are blank. # Issue #11711 basename = 'blank_with_header' - dfs = pd.read_excel(basename + ext, sheet_name=None) + dfs = pd.read_excel(basename + read_ext, sheet_name=None) expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] tm.assert_contains_all(expected_keys, dfs.keys()) # GH6403 - def test_read_excel_blank(self, ext): - actual = pd.read_excel('blank' + ext, 'Sheet1') + def test_read_excel_blank(self, read_ext): + actual = pd.read_excel('blank' + read_ext, 'Sheet1') tm.assert_frame_equal(actual, DataFrame()) - def test_read_excel_blank_with_header(self, ext): + def test_read_excel_blank_with_header(self, read_ext): expected = DataFrame(columns=['col_1', 'col_2']) - actual = pd.read_excel('blank_with_header' + ext, 'Sheet1') + actual = pd.read_excel('blank_with_header' + read_ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - def test_date_conversion_overflow(self, ext): + def test_date_conversion_overflow(self, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], [pd.Timestamp('2016-03-16'), 'Jack Black'], [1e+20, 'Timothy Brown']], columns=['DateColWithBigInt', 'StringCol']) - result = pd.read_excel('testdateoverflow' + ext) + result = pd.read_excel('testdateoverflow' + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, ext, df_ref): + def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1" - df1 = pd.read_excel(filename + ext, + df1 = pd.read_excel(filename + read_ext, sheet_name=sheet_name, index_col=0) # doc with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel(filename + ext, index_col=0, + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_excel_read_buffer(self, ext): + def test_excel_read_buffer(self, read_ext): - pth = 'test1' + ext + pth = 'test1' + read_ext expected = pd.read_excel(pth, 'Sheet1', index_col=0) with open(pth, 'rb') as f: actual = pd.read_excel(f, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - def test_bad_engine_raises(self, ext): + def test_bad_engine_raises(self, read_ext): bad_engine = 'foo' with pytest.raises(ValueError, match="Unknown engine: foo"): pd.read_excel('', engine=bad_engine) @tm.network - def test_read_from_http_url(self, ext): + def test_read_from_http_url(self, read_ext): url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + ext) + 'pandas/tests/io/data/test1' + read_ext) url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + ext) + local_table = pd.read_excel('test1' + read_ext) tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, ext, s3_resource): + def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py - with open('test1' + ext, "rb") as f: - s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, + with open('test1' + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) - url = ('s3://pandas-test/test1' + ext) + url = ('s3://pandas-test/test1' + read_ext) url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + ext) + local_table = pd.read_excel('test1' + read_ext) tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow # ignore warning from old xlrd @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") - def test_read_from_file_url(self, ext, datapath): + def test_read_from_file_url(self, read_ext, datapath): # FILE - localtable = os.path.join(datapath("io", "data"), 'test1' + ext) + localtable = os.path.join(datapath("io", "data"), 'test1' + read_ext) local_table = pd.read_excel(localtable) try: @@ -494,34 +501,34 @@ def test_read_from_file_url(self, ext, datapath): tm.assert_frame_equal(url_table, local_table) - def test_read_from_pathlib_path(self, ext): + def test_read_from_pathlib_path(self, read_ext): # GH12655 from pathlib import Path - str_path = 'test1' + ext + str_path = 'test1' + read_ext expected = pd.read_excel(str_path, 'Sheet1', index_col=0) - path_obj = Path('test1' + ext) + path_obj = Path('test1' + read_ext) actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) @td.skip_if_no('py.path') - def test_read_from_py_localpath(self, ext): + def test_read_from_py_localpath(self, read_ext): # GH12655 from py.path import local as LocalPath - str_path = os.path.join('test1' + ext) + str_path = os.path.join('test1' + read_ext) expected = pd.read_excel(str_path, 'Sheet1', index_col=0) - path_obj = LocalPath().join('test1' + ext) + path_obj = LocalPath().join('test1' + read_ext) actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - def test_reader_seconds(self, ext): + def test_reader_seconds(self, read_ext): # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict({"Time": [time(1, 2, 3), @@ -536,16 +543,16 @@ def test_reader_seconds(self, ext): time(16, 37, 0, 900000), time(18, 20, 54)]}) - actual = pd.read_excel('times_1900' + ext, 'Sheet1') + actual = pd.read_excel('times_1900' + read_ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('times_1904' + ext, 'Sheet1') + actual = pd.read_excel('times_1904' + read_ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, ext): + def test_read_excel_multiindex(self, read_ext): # see gh-4679 mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) - mi_file = "testmultiindex" + ext + mi_file = "testmultiindex" + read_ext # "mi_column" sheet expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -609,20 +616,20 @@ def test_read_excel_multiindex(self, ext): header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex_header_only(self, ext): + def test_read_excel_multiindex_header_only(self, read_ext): # see gh-11733. # # Don't try to parse a header name if there isn't one. - mi_file = "testmultiindex" + ext + mi_file = "testmultiindex" + read_ext result = pd.read_excel(mi_file, "index_col_none", header=[0, 1]) exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) tm.assert_frame_equal(result, expected) - def test_excel_old_index_format(self, ext): + def test_excel_old_index_format(self, read_ext): # see gh-4679 - filename = "test_index_name_pre17" + ext + filename = "test_index_name_pre17" + read_ext # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will @@ -680,20 +687,20 @@ def test_excel_old_index_format(self, ext): actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - def test_read_excel_bool_header_arg(self, ext): + def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - pd.read_excel('test1' + ext, header=arg) + pd.read_excel('test1' + read_ext, header=arg) - def test_read_excel_chunksize(self, ext): + def test_read_excel_chunksize(self, read_ext): # GH 8011 with pytest.raises(NotImplementedError): - pd.read_excel('test1' + ext, chunksize=100) + pd.read_excel('test1' + read_ext, chunksize=100) - def test_read_excel_skiprows_list(self, ext): + def test_read_excel_skiprows_list(self, read_ext): # GH 4903 - actual = pd.read_excel('testskiprows' + ext, + actual = pd.read_excel('testskiprows' + read_ext, 'skiprows_list', skiprows=[0, 2]) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], @@ -702,35 +709,35 @@ def test_read_excel_skiprows_list(self, ext): columns=['a', 'b', 'c', 'd']) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('testskiprows' + ext, + actual = pd.read_excel('testskiprows' + read_ext, 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows(self, ext): + def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 - actual = pd.read_excel('test1' + ext, nrows=num_rows_to_pull) - expected = pd.read_excel('test1' + ext) + actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) + expected = pd.read_excel('test1' + read_ext) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): + def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): # GH 16645 - expected = pd.read_excel('test1' + ext) + expected = pd.read_excel('test1' + read_ext) num_records_in_file = len(expected) num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel('test1' + ext, nrows=num_rows_to_pull) + actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows_non_integer_parameter(self, ext): + def test_read_excel_nrows_non_integer_parameter(self, read_ext): # GH 16645 msg = "'nrows' must be an integer >=0" with pytest.raises(ValueError, match=msg): - pd.read_excel('test1' + ext, nrows='5') + pd.read_excel('test1' + read_ext, nrows='5') - def test_read_excel_squeeze(self, ext): + def test_read_excel_squeeze(self, read_ext): # GH 12157 - f = 'test_squeeze' + ext + f = 'test_squeeze' + read_ext actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') @@ -747,7 +754,6 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestExcelFileRead: @pytest.fixture(autouse=True, params=[ @@ -765,9 +771,9 @@ def cd_and_set_engine(self, request, datapath, monkeypatch): monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'ExcelFile', func) - def test_excel_passes_na(self, ext): + def test_excel_passes_na(self, read_ext): - excel = ExcelFile('test4' + ext) + excel = ExcelFile('test4' + read_ext) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) @@ -782,7 +788,7 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) # 13967 - excel = ExcelFile('test5' + ext) + excel = ExcelFile('test5' + read_ext) parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) @@ -797,18 +803,18 @@ def test_excel_passes_na(self, ext): tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) - def test_unexpected_kwargs_raises(self, ext, arg): + def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 - excel = ExcelFile('test1' + ext) + excel = ExcelFile('test1' + read_ext) kwarg = {arg: 'Sheet1'} msg = "unexpected keyword argument `{}`".format(arg) with pytest.raises(TypeError, match=msg): pd.read_excel(excel, **kwarg) - def test_excel_table_sheet_by_index(self, ext, df_ref): + def test_excel_table_sheet_by_index(self, read_ext, df_ref): - excel = ExcelFile('test1' + ext) + excel = ExcelFile('test1' + read_ext) df1 = pd.read_excel(excel, 0, index_col=0) df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) @@ -834,11 +840,11 @@ def test_excel_table_sheet_by_index(self, ext, df_ref): with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, 'asdf') - def test_sheet_name(self, ext, df_ref): + def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1" - excel = ExcelFile(filename + ext) + excel = ExcelFile(filename + read_ext) df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) @@ -846,9 +852,9 @@ def test_sheet_name(self, ext, df_ref): tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - def test_excel_read_buffer(self, ext): + def test_excel_read_buffer(self, read_ext): - pth = 'test1' + ext + pth = 'test1' + read_ext expected = pd.read_excel(pth, 'Sheet1', index_col=0) with open(pth, 'rb') as f: @@ -856,9 +862,9 @@ def test_excel_read_buffer(self, ext): actual = pd.read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - def test_reader_closes_file(self, ext): + def test_reader_closes_file(self, read_ext): - f = open('test1' + ext, 'rb') + f = open('test1' + read_ext, 'rb') with ExcelFile(f) as xlsx: # parses okay pd.read_excel(xlsx, 'Sheet1', index_col=0) @@ -1068,7 +1074,6 @@ def test_read_excel_parse_dates(self, ext): @td.skip_if_no('xlrd') -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader: """ This is the base class for the xlrd tests, and 3 different file formats @@ -1076,14 +1081,14 @@ class TestXlrdReader: """ @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext, frame): + def test_read_xlrd_book(self, read_ext, frame): import xlrd df = frame engine = "xlrd" sheet_name = "SheetA" - with ensure_clean(ext) as pth: + with ensure_clean(read_ext) as pth: df.to_excel(pth, sheet_name) book = xlrd.open_workbook(pth) From cb5d640c4c76dc6927507d8ebcc93956390f1b2d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 8 Jun 2019 15:54:03 -0400 Subject: [PATCH 4/4] Lint fixup --- pandas/tests/io/test_excel.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 5dd5cee261e93..1f6839fa5dc52 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -213,7 +213,8 @@ def test_index_col_empty(self, read_ext): @pytest.mark.parametrize("index_col", [None, 2]) def test_index_col_with_unnamed(self, read_ext, index_col): # see gh-18792 - result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col) + result = pd.read_excel( + "test1" + read_ext, "Sheet4", index_col=index_col) expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]) if index_col: @@ -256,7 +257,8 @@ def test_excel_table(self, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, skipfooter=1) + df3 = pd.read_excel( + 'test1' + read_ext, 'Sheet1', index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, read_ext): @@ -282,12 +284,14 @@ def test_reader_special_dtypes(self, read_ext): float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = pd.read_excel(basename + read_ext, 'Sheet1', convert_float=False) + actual = pd.read_excel( + basename + read_ext, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = pd.read_excel(basename + read_ext, 'Sheet1', index_col=icol) + actual = pd.read_excel( + basename + read_ext, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) @@ -299,8 +303,9 @@ def test_reader_special_dtypes(self, read_ext): no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = pd.read_excel(basename + read_ext, 'Sheet1', convert_float=False, - converters={"StrCol": str}) + actual = pd.read_excel( + basename + read_ext, 'Sheet1', + convert_float=False, converters={"StrCol": str}) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values @@ -323,7 +328,8 @@ def test_reader_converters(self, read_ext): # should read in correctly and set types of single cells (not array # dtypes) - actual = pd.read_excel(basename + read_ext, 'Sheet1', converters=converters) + actual = pd.read_excel( + basename + read_ext, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) def test_reader_dtype(self, read_ext): @@ -474,8 +480,8 @@ def test_read_from_http_url(self, read_ext): def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py with open('test1' + read_ext, "rb") as f: - s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, - Body=f) + s3_resource.Bucket("pandas-test").put_object( + Key="test1" + read_ext, Body=f) url = ('s3://pandas-test/test1' + read_ext) url_table = pd.read_excel(url) @@ -839,7 +845,7 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): import xlrd # will move to engine-specific tests as new ones are added with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, 'asdf') - + def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1"