Backport PR #39586: REG: read_excel with engine specified raises on n…

…on-path/non-buffer (#39652) Co-authored-by: Richard Shadrach <[email protected]>
pandas-dev · Feb 8, 2021 · 3ebbc77 · 3ebbc77
1 parent b206877
commit 3ebbc77
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 19 deletions.
diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
@@ -23,6 +23,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
 - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
 - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
+- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -1062,14 +1062,16 @@ def __init__(
 
             xlrd_version = LooseVersion(get_version(xlrd))
 
-        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
-            ext = "xls"
-        else:
-            ext = inspect_excel_format(
-                content=path_or_buffer, storage_options=storage_options
-            )
-
+        ext = None
         if engine is None:
+            # Only determine ext if it is needed
+            if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
+                ext = "xls"
+            else:
+                ext = inspect_excel_format(
+                    content=path_or_buffer, storage_options=storage_options
+                )
+
             if ext == "ods":
                 engine = "odf"
             elif ext == "xls":
@@ -1086,13 +1088,22 @@ def __init__(
                 else:
                     engine = "xlrd"
 
-        if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
-            if xlrd_version >= "2":
+        if engine == "xlrd" and xlrd_version is not None:
+            if ext is None:
+                # Need ext to determine ext in order to raise/warn
+                if isinstance(path_or_buffer, xlrd.Book):
+                    ext = "xls"
+                else:
+                    ext = inspect_excel_format(
+                        content=path_or_buffer, storage_options=storage_options
+                    )
+
+            if ext != "xls" and xlrd_version >= "2":
                 raise ValueError(
                     f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                     f"only the xls format is supported. Install openpyxl instead."
                 )
-            else:
+            elif ext != "xls":
                 caller = inspect.stack()[1]
                 if (
                     caller.filename.endswith(

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -531,15 +531,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
 
         version = LooseVersion(get_version(openpyxl))
 
-        if version >= "3.0.0":
+        # There is no good way of determining if a sheet is read-only
+        # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
+        is_readonly = hasattr(sheet, "reset_dimensions")
+
+        if version >= "3.0.0" and is_readonly:
             sheet.reset_dimensions()
 
         data: List[List[Scalar]] = []
         for row_number, row in enumerate(sheet.rows):
             converted_row = [self._convert_cell(cell, convert_float) for cell in row]
             data.append(converted_row)
 
-        if version >= "3.0.0" and len(data) > 0:
+        if version >= "3.0.0" and is_readonly and len(data) > 0:
             # With dimension reset, openpyxl no longer pads rows
             max_width = max(len(data_row) for data_row in data)
             if min(len(data_row) for data_row in data) < max_width:

diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
@@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext):
         styled.to_excel(filename, engine="openpyxl")
 
 
+@pytest.mark.parametrize("read_only", [True, False])
+def test_read_workbook(datapath, ext, read_only):
+    # GH 39528
+    filename = datapath("io", "data", "excel", "test1" + ext)
+    wb = openpyxl.load_workbook(filename, read_only=read_only)
+    result = pd.read_excel(wb, engine="openpyxl")
+    wb.close()
+    expected = pd.read_excel(filename)
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "header, expected_data",
     [
@@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext):
 @pytest.mark.parametrize(
     "filename", ["dimension_missing", "dimension_small", "dimension_large"]
 )
-@pytest.mark.xfail(
-    LooseVersion(get_version(openpyxl)) < "3.0.0",
-    reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
-)
-def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
+# When read_only is None, use read_excel instead of a workbook
+@pytest.mark.parametrize("read_only", [True, False, None])
+def test_read_with_bad_dimension(
+    datapath, ext, header, expected_data, filename, read_only, request
+):
     # GH 38956, 39001 - no/incorrect dimension information
+    version = LooseVersion(get_version(openpyxl))
+    if (read_only or read_only is None) and version < "3.0.0":
+        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
     path = datapath("io", "data", "excel", f"{filename}{ext}")
-    result = pd.read_excel(path, header=header)
+    if read_only is None:
+        result = pd.read_excel(path, header=header)
+    else:
+        wb = openpyxl.load_workbook(path, read_only=read_only)
+        result = pd.read_excel(wb, engine="openpyxl", header=header)
+        wb.close()
     expected = DataFrame(expected_data)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -2,6 +2,7 @@
 from functools import partial
 import os
 from urllib.error import URLError
+from zipfile import BadZipFile
 
 import numpy as np
 import pytest
@@ -642,7 +643,13 @@ def test_missing_file_raises(self, read_ext):
 
     def test_corrupt_bytes_raises(self, read_ext, engine):
         bad_stream = b"foo"
-        with pytest.raises(ValueError, match="File is not a recognized excel file"):
+        if engine is None or engine == "xlrd":
+            error = ValueError
+            msg = "File is not a recognized excel file"
+        else:
+            error = BadZipFile
+            msg = "File is not a zip file"
+        with pytest.raises(error, match=msg):
             pd.read_excel(bad_stream)
 
     @tm.network