From 88bbfe1d5f3517d0855a998c96b6bf6a5b111797 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 24 Oct 2023 13:51:20 -0700
Subject: [PATCH 1/2] TST: check names in test_readers.py

---
 pandas/tests/io/excel/test_readers.py | 87 ++++++++++++++++++---------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index c5bf935b0d54d..34fbd3d05fd7f 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -117,6 +117,10 @@ def read_ext(engine_and_read_ext):
     return read_ext
 
 
+def adjust_expected(expected: DataFrame, read_ext: str) -> None:
+    expected.index.name = None
+
+
 class TestReaders:
     @pytest.fixture(autouse=True)
     def cd_and_set_engine(self, engine, datapath, monkeypatch):
@@ -203,7 +207,9 @@ def test_usecols_list(self, request, engine, read_ext, df_ref):
                 )
             )
 
-        df_ref = df_ref.reindex(columns=["B", "C"])
+        expected = df_ref[["B", "C"]]
+        adjust_expected(expected, read_ext)
+
         df1 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
         )
@@ -216,8 +222,8 @@ def test_usecols_list(self, request, engine, read_ext, df_ref):
         )
 
         # TODO add index to xls file)
-        tm.assert_frame_equal(df1, df_ref, check_names=False)
-        tm.assert_frame_equal(df2, df_ref, check_names=False)
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
 
     def test_usecols_str(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
@@ -227,7 +233,9 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
                 )
             )
 
-        df1 = df_ref.reindex(columns=["A", "B", "C"])
+        expected = df_ref[["A", "B", "C"]]
+        adjust_expected(expected, read_ext)
+
         df2 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
         )
@@ -240,10 +248,12 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
         )
 
         # TODO add index to xls, read xls ignores index name ?
-        tm.assert_frame_equal(df2, df1, check_names=False)
-        tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df2, expected)
+        tm.assert_frame_equal(df3, expected)
+
+        expected = df_ref[["B", "C"]]
+        adjust_expected(expected, read_ext)
 
-        df1 = df_ref.reindex(columns=["B", "C"])
         df2 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
         )
@@ -255,10 +265,9 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
             usecols="A,C,D",
         )
         # TODO add index to xls file
-        tm.assert_frame_equal(df2, df1, check_names=False)
-        tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df2, expected)
+        tm.assert_frame_equal(df3, expected)
 
-        df1 = df_ref.reindex(columns=["B", "C"])
         df2 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
         )
@@ -269,8 +278,8 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
             index_col=0,
             usecols="A,C:D",
         )
-        tm.assert_frame_equal(df2, df1, check_names=False)
-        tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df2, expected)
+        tm.assert_frame_equal(df3, expected)
 
     @pytest.mark.parametrize(
         "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
@@ -286,10 +295,12 @@ def test_usecols_diff_positional_int_columns_order(
             )
 
         expected = df_ref[["A", "C"]]
+        adjust_expected(expected, read_ext)
+
         result = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
         )
-        tm.assert_frame_equal(result, expected, check_names=False)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
     def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
@@ -297,7 +308,7 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r
         expected.index = range(len(expected))
 
         result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
-        tm.assert_frame_equal(result, expected, check_names=False)
+        tm.assert_frame_equal(result, expected)
 
     def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
@@ -308,8 +319,10 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
             )
 
         expected = df_ref
+        adjust_expected(expected, read_ext)
+
         result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
-        tm.assert_frame_equal(result, expected, check_names=False)
+        tm.assert_frame_equal(result, expected)
 
     def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
         if engine == "pyxlsb":
@@ -320,10 +333,12 @@ def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
             )
 
         expected = df_ref[["C", "D"]]
+        adjust_expected(expected, read_ext)
+
         result = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
         )
-        tm.assert_frame_equal(result, expected, check_names=False)
+        tm.assert_frame_equal(result, expected)
 
     def test_usecols_excel_range_str_invalid(self, read_ext):
         msg = "Invalid column name: E1"
@@ -425,13 +440,16 @@ def test_excel_table(self, request, engine, read_ext, df_ref):
                 )
             )
 
+        expected = df_ref
+        adjust_expected(expected, read_ext)
+
         df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
         df2 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
         )
         # TODO add index to file
-        tm.assert_frame_equal(df1, df_ref, check_names=False)
-        tm.assert_frame_equal(df2, df_ref, check_names=False)
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
 
         df3 = pd.read_excel(
             "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
@@ -527,7 +545,7 @@ def test_reader_dtype(self, read_ext):
                 "c": [1, 2, 3, 4],
                 "d": [1.0, 2.0, np.nan, 4.0],
             }
-        ).reindex(columns=["a", "b", "c", "d"])
+        )
 
         tm.assert_frame_equal(actual, expected)
 
@@ -817,13 +835,16 @@ def test_sheet_name(self, request, read_ext, engine, df_ref):
         filename = "test1"
         sheet_name = "Sheet1"
 
+        expected = df_ref
+        adjust_expected(expected, read_ext)
+
         df1 = pd.read_excel(
             filename + read_ext, sheet_name=sheet_name, index_col=0
         )  # doc
         df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
 
-        tm.assert_frame_equal(df1, df_ref, check_names=False)
-        tm.assert_frame_equal(df2, df_ref, check_names=False)
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
 
     def test_excel_read_buffer(self, read_ext):
         pth = "test1" + read_ext
@@ -1051,7 +1072,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext):
         expected.columns = ["a", "b", "c", "d"]
 
         actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
-        tm.assert_frame_equal(actual, expected, check_names=False)
+        tm.assert_frame_equal(actual, expected)
 
         # "both" sheet
         expected.columns = mi
@@ -1059,7 +1080,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext):
         actual = pd.read_excel(
             mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
         )
-        tm.assert_frame_equal(actual, expected, check_names=False)
+        tm.assert_frame_equal(actual, expected)
 
         # "mi_index_name" sheet
         expected.columns = ["a", "b", "c", "d"]
@@ -1227,7 +1248,7 @@ def test_excel_old_index_format(self, read_ext):
         expected.index = mi
 
         actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
-        tm.assert_frame_equal(actual, expected, check_names=False)
+        tm.assert_frame_equal(actual, expected)
 
     def test_read_excel_bool_header_arg(self, read_ext):
         # GH 6114
@@ -1572,17 +1593,20 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
                 )
             )
 
+        expected = df_ref
+        adjust_expected(expected, read_ext)
+
         with pd.ExcelFile("test1" + read_ext) as excel:
             df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
             df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
-        tm.assert_frame_equal(df1, df_ref, check_names=False)
-        tm.assert_frame_equal(df2, df_ref, check_names=False)
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
 
         with pd.ExcelFile("test1" + read_ext) as excel:
             df1 = excel.parse(0, index_col=0)
             df2 = excel.parse(1, skiprows=[1], index_col=0)
-        tm.assert_frame_equal(df1, df_ref, check_names=False)
-        tm.assert_frame_equal(df2, df_ref, check_names=False)
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
 
         with pd.ExcelFile("test1" + read_ext) as excel:
             df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
@@ -1601,6 +1625,9 @@ def test_sheet_name(self, request, engine, read_ext, df_ref):
                 )
             )
 
+        expected = df_ref
+        adjust_expected(expected, read_ext)
+
         filename = "test1"
         sheet_name = "Sheet1"
 
@@ -1610,8 +1637,8 @@ def test_sheet_name(self, request, engine, read_ext, df_ref):
         with pd.ExcelFile(filename + read_ext) as excel:
             df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)
 
-        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
-        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
+        tm.assert_frame_equal(df1_parse, expected)
+        tm.assert_frame_equal(df2_parse, expected)
 
     @pytest.mark.parametrize(
         "sheet_name",

From 29a0eebbb42e47b0b597ccec506d673bdefb676c Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 24 Oct 2023 13:58:32 -0700
Subject: [PATCH 2/2] TST: de-duplicate pyxlsb datetime xfail in test_readers.py

---
 pandas/tests/io/excel/test_readers.py | 136 ++++++--------------------
 1 file changed, 28 insertions(+), 108 deletions(-)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 34fbd3d05fd7f..74fe5166df65f 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -121,6 +121,15 @@ def adjust_expected(expected: DataFrame, read_ext: str) -> None:
     expected.index.name = None
 
 
+def xfail_datetimes_with_pyxlsb(engine, request):
+    if engine == "pyxlsb":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Sheets containing datetimes not supported by pyxlsb"
+            )
+        )
+
+
 class TestReaders:
     @pytest.fixture(autouse=True)
     def cd_and_set_engine(self, engine, datapath, monkeypatch):
@@ -200,12 +209,7 @@ def test_usecols_int(self, read_ext):
             )
 
     def test_usecols_list(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref[["B", "C"]]
         adjust_expected(expected, read_ext)
@@ -226,12 +230,7 @@ def test_usecols_list(self, request, engine, read_ext, df_ref):
         tm.assert_frame_equal(df2, expected)
 
     def test_usecols_str(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref[["A", "B", "C"]]
         adjust_expected(expected, read_ext)
@@ -287,12 +286,7 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
     def test_usecols_diff_positional_int_columns_order(
         self, request, engine, read_ext, usecols, df_ref
     ):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref[["A", "C"]]
         adjust_expected(expected, read_ext)
@@ -311,12 +305,7 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r
         tm.assert_frame_equal(result, expected)
 
     def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref
         adjust_expected(expected, read_ext)
@@ -325,12 +314,7 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
         tm.assert_frame_equal(result, expected)
 
     def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref[["C", "D"]]
         adjust_expected(expected, read_ext)
@@ -415,12 +399,7 @@ def test_excel_stop_iterator(self, read_ext):
         tm.assert_frame_equal(parsed, expected)
 
     def test_excel_cell_error_na(self, request, engine, read_ext):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         # https://github.com/tafia/calamine/issues/355
         if engine == "calamine" and read_ext == ".ods":
@@ -433,12 +412,7 @@ def test_excel_cell_error_na(self, request, engine, read_ext):
         tm.assert_frame_equal(parsed, expected)
 
     def test_excel_table(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref
         adjust_expected(expected, read_ext)
@@ -457,12 +431,7 @@ def test_excel_table(self, request, engine, read_ext, df_ref):
         tm.assert_frame_equal(df3, df1.iloc[:-1])
 
     def test_reader_special_dtypes(self, request, engine, read_ext):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = DataFrame.from_dict(
             {
@@ -795,12 +764,7 @@ def test_exception_message_includes_sheet_name(self, read_ext):
     @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl")
     def test_date_conversion_overflow(self, request, engine, read_ext):
         # GH 10001 : pandas.ExcelFile ignore parse_dates=False
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = DataFrame(
             [
@@ -826,12 +790,8 @@ def test_date_conversion_overflow(self, request, engine, read_ext):
         tm.assert_frame_equal(result, expected)
 
     def test_sheet_name(self, request, read_ext, engine, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
+
         filename = "test1"
         sheet_name = "Sheet1"
 
@@ -995,12 +955,7 @@ def test_close_from_py_localpath(self, read_ext):
             f.read()
 
     def test_reader_seconds(self, request, engine, read_ext):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         # GH 55045
         if engine == "calamine" and read_ext == ".ods":
@@ -1037,12 +992,7 @@ def test_reader_seconds(self, request, engine, read_ext):
 
     def test_read_excel_multiindex(self, request, engine, read_ext):
         # see gh-4679
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         # https://github.com/tafia/calamine/issues/354
         if engine == "calamine" and read_ext == ".ods":
@@ -1136,12 +1086,7 @@ def test_read_excel_multiindex_blank_after_name(
         self, request, engine, read_ext, sheet_name, idx_lvl2
     ):
         # GH34673
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb (GH4679"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         mi_file = "testmultiindex" + read_ext
         mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"])
@@ -1259,12 +1204,7 @@ def test_read_excel_bool_header_arg(self, read_ext):
 
     def test_read_excel_skiprows(self, request, engine, read_ext):
         # GH 4903
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         actual = pd.read_excel(
             "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
@@ -1314,12 +1254,7 @@ def test_read_excel_skiprows(self, request, engine, read_ext):
 
     def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext):
         # GH 4903
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         actual = pd.read_excel(
             "testskiprows" + read_ext,
@@ -1586,12 +1521,7 @@ def test_excel_passes_na_filter(self, read_ext, na_filter):
         tm.assert_frame_equal(parsed, expected)
 
     def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref
         adjust_expected(expected, read_ext)
@@ -1618,12 +1548,7 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
         tm.assert_frame_equal(df3, df1.iloc[:-1])
 
     def test_sheet_name(self, request, engine, read_ext, df_ref):
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         expected = df_ref
         adjust_expected(expected, read_ext)
@@ -1713,12 +1638,7 @@ def test_header_with_index_col(self, filename):
 
     def test_read_datetime_multiindex(self, request, engine, read_ext):
         # GH 34748
-        if engine == "pyxlsb":
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="Sheets containing datetimes not supported by pyxlsb"
-                )
-            )
+        xfail_datetimes_with_pyxlsb(engine, request)
 
         f = "test_datetime_mi" + read_ext
         with pd.ExcelFile(f) as excel: