From ff280485a24acbb5f466f4e29d19ab7bc3d02de2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 28 Dec 2018 15:21:02 -0800 Subject: [PATCH] Decouple xlrd reading from ExcelFile class (#24423) --- pandas/io/excel.py | 242 +++++++++++++++++++--------------- pandas/tests/io/test_excel.py | 68 ++++++---- 2 files changed, 175 insertions(+), 135 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index c2c9a688f3f0a..9399f36072e5f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -358,23 +358,16 @@ def read_excel(io, **kwds) -class ExcelFile(object): - """ - Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd. See read_excel for more documentation - - Parameters - ---------- - io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object or xlrd workbook - If a string or path object, expected to be a path to xls or xlsx file - engine : string, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd - """ +class _XlrdReader(object): - def __init__(self, io, **kwds): + def __init__(self, filepath_or_buffer): + """Reader using xlrd engine. + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. + """ err_msg = "Install xlrd >= 1.0.0 for Excel support" try: @@ -386,46 +379,39 @@ def __init__(self, io, **kwds): raise ImportError(err_msg + ". Current version " + xlrd.__VERSION__) - # could be a str, ExcelFile, Book, etc. - self.io = io - # Always a string - self._io = _stringify_path(io) - - engine = kwds.pop('engine', None) - - if engine is not None and engine != 'xlrd': - raise ValueError("Unknown engine: {engine}".format(engine=engine)) - - # If io is a url, want to keep the data as bytes so can't pass - # to get_filepath_or_buffer() - if _is_url(self._io): - io = _urlopen(self._io) - elif not isinstance(self.io, (ExcelFile, xlrd.Book)): - io, _, _, _ = get_filepath_or_buffer(self._io) - - if engine == 'xlrd' and isinstance(io, xlrd.Book): - self.book = io - elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): + # If filepath_or_buffer is a url, want to keep the data as bytes so + # can't pass to get_filepath_or_buffer() + if _is_url(filepath_or_buffer): + filepath_or_buffer = _urlopen(filepath_or_buffer) + elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer) + + if isinstance(filepath_or_buffer, xlrd.Book): + self.book = filepath_or_buffer + elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr( + filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too - if hasattr(io, 'seek'): + if hasattr(filepath_or_buffer, 'seek'): try: # GH 19779 - io.seek(0) + filepath_or_buffer.seek(0) except UnsupportedOperation: # HTTPResponse does not support seek() # GH 20434 pass - data = io.read() + data = filepath_or_buffer.read() self.book = xlrd.open_workbook(file_contents=data) - elif isinstance(self._io, compat.string_types): - self.book = xlrd.open_workbook(self._io) + elif isinstance(filepath_or_buffer, compat.string_types): + self.book = xlrd.open_workbook(filepath_or_buffer) else: raise ValueError('Must explicitly set engine if not passing in' ' buffer or path for io.') - def __fspath__(self): - return self._io + @property + def sheet_names(self): + return self.book.sheet_names() def parse(self, sheet_name=0, @@ -434,12 +420,13 @@ def parse(self, index_col=None, usecols=None, squeeze=False, - converters=None, + dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, + verbose=False, parse_dates=False, date_parser=None, thousands=None, @@ -448,72 +435,9 @@ def parse(self, convert_float=True, mangle_dupe_cols=True, **kwds): - """ - Parse specified sheet(s) into a DataFrame - - Equivalent to read_excel(ExcelFile, ...) See the read_excel - docstring for more info on accepted parameters - """ - - # Can't use _deprecate_kwarg since sheetname=None has a special meaning - if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: - warnings.warn("The `sheetname` keyword is deprecated, use " - "`sheet_name` instead", FutureWarning, stacklevel=2) - sheet_name = kwds.pop("sheetname") - elif 'sheetname' in kwds: - raise TypeError("Cannot specify both `sheet_name` " - "and `sheetname`. Use just `sheet_name`") - - return self._parse_excel(sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - **kwds) - - def _parse_excel(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): _validate_header_arg(header) - if 'chunksize' in kwds: - raise NotImplementedError("chunksize keyword of read_excel " - "is not implemented") - from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) @@ -563,7 +487,7 @@ def _parse_cell(cell_contents, cell_typ): sheets = sheet_name ret_dict = True elif sheet_name is None: - sheets = self.sheet_names + sheets = self.book.sheet_names() ret_dict = True else: sheets = [sheet_name] @@ -678,9 +602,111 @@ def _parse_cell(cell_contents, cell_typ): else: return output[asheetname] + +class ExcelFile(object): + """ + Class for parsing tabular excel sheets into DataFrame objects. + Uses xlrd. See read_excel for more documentation + + Parameters + ---------- + io : string, path object (pathlib.Path or py._path.local.LocalPath), + file-like object or xlrd workbook + If a string or path object, expected to be a path to xls or xlsx file. + engine : string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or ``xlrd``. + """ + + _engines = { + 'xlrd': _XlrdReader, + } + + def __init__(self, io, engine=None): + if engine is None: + engine = 'xlrd' + if engine not in self._engines: + raise ValueError("Unknown engine: {engine}".format(engine=engine)) + + # could be a str, ExcelFile, Book, etc. + self.io = io + # Always a string + self._io = _stringify_path(io) + + self._reader = self._engines[engine](self._io) + + def __fspath__(self): + return self._io + + def parse(self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds): + """ + Parse specified sheet(s) into a DataFrame + + Equivalent to read_excel(ExcelFile, ...) See the read_excel + docstring for more info on accepted parameters + """ + + # Can't use _deprecate_kwarg since sheetname=None has a special meaning + if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: + warnings.warn("The `sheetname` keyword is deprecated, use " + "`sheet_name` instead", FutureWarning, stacklevel=2) + sheet_name = kwds.pop("sheetname") + elif 'sheetname' in kwds: + raise TypeError("Cannot specify both `sheet_name` " + "and `sheetname`. Use just `sheet_name`") + + if 'chunksize' in kwds: + raise NotImplementedError("chunksize keyword of read_excel " + "is not implemented") + + return self._reader.parse(sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds) + + @property + def book(self): + return self._reader.book + @property def sheet_names(self): - return self.book.sheet_names() + return self._reader.sheet_names def close(self): """close io if necessary""" diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 033d600ffc09b..84383afed1d03 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -119,6 +119,15 @@ def get_exceldf(self, basename, ext, *args, **kwds): class ReadingTestsBase(SharedItems): # This is based on ExcelWriterBase + @pytest.fixture(autouse=True, params=['xlrd', None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): @@ -674,14 +683,6 @@ def test_sheet_name_both_raises(self, ext): excel.parse(sheetname='Sheet1', sheet_name='Sheet1') - -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) -class TestXlrdReader(ReadingTestsBase): - """ - This is the base class for the xlrd tests, and 3 different file formats - are supported: xls, xlsx, xlsm - """ - def test_excel_read_buffer(self, ext): pth = os.path.join(self.dirpath, 'test1' + ext) @@ -695,25 +696,10 @@ def test_excel_read_buffer(self, ext): actual = read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext): - import xlrd - df = self.frame - - engine = "xlrd" - sheet_name = "SheetA" - - with ensure_clean(ext) as pth: - df.to_excel(pth, sheet_name) - book = xlrd.open_workbook(pth) - - with ExcelFile(book, engine=engine) as xl: - result = read_excel(xl, sheet_name, index_col=0) - tm.assert_frame_equal(df, result) - - result = read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) - tm.assert_frame_equal(df, result) + def test_bad_engine_raises(self, ext): + bad_engine = 'foo' + with pytest.raises(ValueError, message="Unknown engine: foo"): + read_excel('', engine=bad_engine) @tm.network def test_read_from_http_url(self, ext): @@ -1156,6 +1142,34 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestXlrdReader(ReadingTestsBase): + """ + This is the base class for the xlrd tests, and 3 different file formats + are supported: xls, xlsx, xlsm + """ + + @td.skip_if_no("xlwt") + def test_read_xlrd_book(self, ext): + import xlrd + df = self.frame + + engine = "xlrd" + sheet_name = "SheetA" + + with ensure_clean(ext) as pth: + df.to_excel(pth, sheet_name) + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine=engine) as xl: + result = read_excel(xl, sheet_name, index_col=0) + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) + tm.assert_frame_equal(df, result) + + class _WriterBase(SharedItems): @pytest.fixture(autouse=True)