From 3ebbc772cab1e0fec084f498fd9f758057d6871c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Feb 2021 06:43:12 +0000 Subject: [PATCH] Backport PR #39586: REG: read_excel with engine specified raises on non-path/non-buffer (#39652) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.2.2.rst | 1 + pandas/io/excel/_base.py | 31 +++++++++++++++++-------- pandas/io/excel/_openpyxl.py | 8 +++++-- pandas/tests/io/excel/test_openpyxl.py | 32 +++++++++++++++++++++----- pandas/tests/io/excel/test_readers.py | 9 +++++++- 5 files changed, 62 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index b2b7326b9cb04..4f80263fda92d 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -23,6 +23,7 @@ Fixed regressions - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`) - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`) +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 7d64ab77c962d..850570fc743b7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1062,14 +1062,16 @@ def __init__( xlrd_version = LooseVersion(get_version(xlrd)) - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: - ext = inspect_excel_format( - content=path_or_buffer, storage_options=storage_options - ) - + ext = None if engine is None: + # Only determine ext if it is needed + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + if ext == "ods": engine = "odf" elif ext == "xls": @@ -1086,13 +1088,22 @@ def __init__( else: engine = "xlrd" - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need ext to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 4f02aff2eb992..0f519c13f98e7 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -531,7 +531,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) - if version >= "3.0.0": + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: sheet.reset_dimensions() data: List[List[Scalar]] = [] @@ -539,7 +543,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: converted_row = [self._convert_cell(cell, convert_float) for cell in row] data.append(converted_row) - if version >= "3.0.0" and len(data) > 0: + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 640501baffc62..da12829b579fe 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext): styled.to_excel(filename, engine="openpyxl") +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "header, expected_data", [ @@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext): @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) -@pytest.mark.xfail( - LooseVersion(get_version(openpyxl)) < "3.0.0", - reason="openpyxl read-only sheet is incorrect when dimension data is wrong", -) -def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_bad_dimension( + datapath, ext, header, expected_data, filename, read_only, request +): # GH 38956, 39001 - no/incorrect dimension information + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) path = datapath("io", "data", "excel", f"{filename}{ext}") - result = pd.read_excel(path, header=header) + if read_only is None: + result = pd.read_excel(path, header=header) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 9b3d359dc01a5..bd3bfa207c4b0 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -642,7 +643,13 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network