Skip to content

Commit

Permalink
Backport PR #39586: REG: read_excel with engine specified raises on n…
Browse files Browse the repository at this point in the history
…on-path/non-buffer (#39652)

Co-authored-by: Richard Shadrach <[email protected]>
  • Loading branch information
simonjayhawkins and rhshadrach authored Feb 8, 2021
1 parent b206877 commit 3ebbc77
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 19 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Fixed regressions
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
-

.. ---------------------------------------------------------------------------
Expand Down
31 changes: 21 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,14 +1062,16 @@ def __init__(

xlrd_version = LooseVersion(get_version(xlrd))

if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

ext = None
if engine is None:
# Only determine ext if it is needed
if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

if ext == "ods":
engine = "odf"
elif ext == "xls":
Expand All @@ -1086,13 +1088,22 @@ def __init__(
else:
engine = "xlrd"

if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
if xlrd_version >= "2":
if engine == "xlrd" and xlrd_version is not None:
if ext is None:
# Need ext to determine ext in order to raise/warn
if isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

if ext != "xls" and xlrd_version >= "2":
raise ValueError(
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
f"only the xls format is supported. Install openpyxl instead."
)
else:
elif ext != "xls":
caller = inspect.stack()[1]
if (
caller.filename.endswith(
Expand Down
8 changes: 6 additions & 2 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,15 +531,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:

version = LooseVersion(get_version(openpyxl))

if version >= "3.0.0":
# There is no good way of determining if a sheet is read-only
# https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
is_readonly = hasattr(sheet, "reset_dimensions")

if version >= "3.0.0" and is_readonly:
sheet.reset_dimensions()

data: List[List[Scalar]] = []
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
data.append(converted_row)

if version >= "3.0.0" and len(data) > 0:
if version >= "3.0.0" and is_readonly and len(data) > 0:
# With dimension reset, openpyxl no longer pads rows
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
Expand Down
32 changes: 26 additions & 6 deletions pandas/tests/io/excel/test_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext):
styled.to_excel(filename, engine="openpyxl")


@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
# GH 39528
filename = datapath("io", "data", "excel", "test1" + ext)
wb = openpyxl.load_workbook(filename, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl")
wb.close()
expected = pd.read_excel(filename)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"header, expected_data",
[
Expand All @@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext):
@pytest.mark.parametrize(
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
@pytest.mark.xfail(
LooseVersion(get_version(openpyxl)) < "3.0.0",
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
)
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
datapath, ext, header, expected_data, filename, read_only, request
):
# GH 38956, 39001 - no/incorrect dimension information
version = LooseVersion(get_version(openpyxl))
if (read_only or read_only is None) and version < "3.0.0":
msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
request.node.add_marker(pytest.mark.xfail(reason=msg))
path = datapath("io", "data", "excel", f"{filename}{ext}")
result = pd.read_excel(path, header=header)
if read_only is None:
result = pd.read_excel(path, header=header)
else:
wb = openpyxl.load_workbook(path, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl", header=header)
wb.close()
expected = DataFrame(expected_data)
tm.assert_frame_equal(result, expected)
9 changes: 8 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
import os
from urllib.error import URLError
from zipfile import BadZipFile

import numpy as np
import pytest
Expand Down Expand Up @@ -642,7 +643,13 @@ def test_missing_file_raises(self, read_ext):

def test_corrupt_bytes_raises(self, read_ext, engine):
bad_stream = b"foo"
with pytest.raises(ValueError, match="File is not a recognized excel file"):
if engine is None or engine == "xlrd":
error = ValueError
msg = "File is not a recognized excel file"
else:
error = BadZipFile
msg = "File is not a zip file"
with pytest.raises(error, match=msg):
pd.read_excel(bad_stream)

@tm.network
Expand Down

0 comments on commit 3ebbc77

Please sign in to comment.