From 88bbfe1d5f3517d0855a998c96b6bf6a5b111797 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 24 Oct 2023 13:51:20 -0700 Subject: [PATCH 1/2] TST: check names in test_readers.py --- pandas/tests/io/excel/test_readers.py | 87 ++++++++++++++++++--------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c5bf935b0d54d..34fbd3d05fd7f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -117,6 +117,10 @@ def read_ext(engine_and_read_ext): return read_ext +def adjust_expected(expected: DataFrame, read_ext: str) -> None: + expected.index.name = None + + class TestReaders: @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): @@ -203,7 +207,9 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): ) ) - df_ref = df_ref.reindex(columns=["B", "C"]) + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext) + df1 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3] ) @@ -216,8 +222,8 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): ) # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) def test_usecols_str(self, request, engine, read_ext, df_ref): if engine == "pyxlsb": @@ -227,7 +233,9 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): ) ) - df1 = df_ref.reindex(columns=["A", "B", "C"]) + expected = df_ref[["A", "B", "C"]] + adjust_expected(expected, read_ext) + df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D" ) @@ -240,10 +248,12 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): ) # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D" ) @@ -255,10 +265,9 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): usecols="A,C,D", ) # TODO add index to xls file - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D" ) @@ -269,8 +278,8 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): index_col=0, usecols="A,C:D", ) - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) @pytest.mark.parametrize( "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] @@ -286,10 +295,12 @@ def test_usecols_diff_positional_int_columns_order( ) expected = df_ref[["A", "C"]] + adjust_expected(expected, read_ext) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): @@ -297,7 +308,7 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r expected.index = range(len(expected)) result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): if engine == "pyxlsb": @@ -308,8 +319,10 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): ) expected = df_ref + adjust_expected(expected, read_ext) + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): if engine == "pyxlsb": @@ -320,10 +333,12 @@ def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): ) expected = df_ref[["C", "D"]] + adjust_expected(expected, read_ext) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E" ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str_invalid(self, read_ext): msg = "Invalid column name: E1" @@ -425,13 +440,16 @@ def test_excel_table(self, request, engine, read_ext, df_ref): ) ) + expected = df_ref + adjust_expected(expected, read_ext) + df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0 ) # TODO add index to file - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) df3 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1 @@ -527,7 +545,7 @@ def test_reader_dtype(self, read_ext): "c": [1, 2, 3, 4], "d": [1.0, 2.0, np.nan, 4.0], } - ).reindex(columns=["a", "b", "c", "d"]) + ) tm.assert_frame_equal(actual, expected) @@ -817,13 +835,16 @@ def test_sheet_name(self, request, read_ext, engine, df_ref): filename = "test1" sheet_name = "Sheet1" + expected = df_ref + adjust_expected(expected, read_ext) + df1 = pd.read_excel( filename + read_ext, sheet_name=sheet_name, index_col=0 ) # doc df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) def test_excel_read_buffer(self, read_ext): pth = "test1" + read_ext @@ -1051,7 +1072,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): expected.columns = ["a", "b", "c", "d"] actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "both" sheet expected.columns = mi @@ -1059,7 +1080,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): actual = pd.read_excel( mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] ) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] @@ -1227,7 +1248,7 @@ def test_excel_old_index_format(self, read_ext): expected.index = mi actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 @@ -1572,17 +1593,20 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): ) ) + expected = df_ref + adjust_expected(expected, read_ext) + with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, sheet_name=0, index_col=0) df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1) @@ -1601,6 +1625,9 @@ def test_sheet_name(self, request, engine, read_ext, df_ref): ) ) + expected = df_ref + adjust_expected(expected, read_ext) + filename = "test1" sheet_name = "Sheet1" @@ -1610,8 +1637,8 @@ def test_sheet_name(self, request, engine, read_ext, df_ref): with pd.ExcelFile(filename + read_ext) as excel: df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, expected) + tm.assert_frame_equal(df2_parse, expected) @pytest.mark.parametrize( "sheet_name", From 29a0eebbb42e47b0b597ccec506d673bdefb676c Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 24 Oct 2023 13:58:32 -0700 Subject: [PATCH 2/2] de-duplicate --- pandas/tests/io/excel/test_readers.py | 136 ++++++-------------------- 1 file changed, 28 insertions(+), 108 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 34fbd3d05fd7f..74fe5166df65f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -121,6 +121,15 @@ def adjust_expected(expected: DataFrame, read_ext: str) -> None: expected.index.name = None +def xfail_datetimes_with_pyxlsb(engine, request): + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + + class TestReaders: @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): @@ -200,12 +209,7 @@ def test_usecols_int(self, read_ext): ) def test_usecols_list(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["B", "C"]] adjust_expected(expected, read_ext) @@ -226,12 +230,7 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df2, expected) def test_usecols_str(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["A", "B", "C"]] adjust_expected(expected, read_ext) @@ -287,12 +286,7 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): def test_usecols_diff_positional_int_columns_order( self, request, engine, read_ext, usecols, df_ref ): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["A", "C"]] adjust_expected(expected, read_ext) @@ -311,12 +305,7 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r tm.assert_frame_equal(result, expected) def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref adjust_expected(expected, read_ext) @@ -325,12 +314,7 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["C", "D"]] adjust_expected(expected, read_ext) @@ -415,12 +399,7 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # https://github.com/tafia/calamine/issues/355 if engine == "calamine" and read_ext == ".ods": @@ -433,12 +412,7 @@ def test_excel_cell_error_na(self, request, engine, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_table(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref adjust_expected(expected, read_ext) @@ -457,12 +431,7 @@ def test_excel_table(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = DataFrame.from_dict( { @@ -795,12 +764,7 @@ def test_exception_message_includes_sheet_name(self, read_ext): @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = DataFrame( [ @@ -826,12 +790,8 @@ def test_date_conversion_overflow(self, request, engine, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, request, read_ext, engine, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + filename = "test1" sheet_name = "Sheet1" @@ -995,12 +955,7 @@ def test_close_from_py_localpath(self, read_ext): f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # GH 55045 if engine == "calamine" and read_ext == ".ods": @@ -1037,12 +992,7 @@ def test_reader_seconds(self, request, engine, read_ext): def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) # https://github.com/tafia/calamine/issues/354 if engine == "calamine" and read_ext == ".ods": @@ -1136,12 +1086,7 @@ def test_read_excel_multiindex_blank_after_name( self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb (GH4679" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) mi_file = "testmultiindex" + read_ext mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) @@ -1259,12 +1204,7 @@ def test_read_excel_bool_header_arg(self, read_ext): def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] @@ -1314,12 +1254,7 @@ def test_read_excel_skiprows(self, request, engine, read_ext): def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) actual = pd.read_excel( "testskiprows" + read_ext, @@ -1586,12 +1521,7 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): tm.assert_frame_equal(parsed, expected) def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref adjust_expected(expected, read_ext) @@ -1618,12 +1548,7 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, request, engine, read_ext, df_ref): - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref adjust_expected(expected, read_ext) @@ -1713,12 +1638,7 @@ def test_header_with_index_col(self, filename): def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine == "pyxlsb": - request.applymarker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) f = "test_datetime_mi" + read_ext with pd.ExcelFile(f) as excel: