From 9019904bdf7a300d1c52408c2d5a8b8d81cc3ef6 Mon Sep 17 00:00:00 2001 From: Dmitriy <3132181+dimastbk@users.noreply.github.com> Date: Thu, 26 Jan 2023 13:03:06 +0600 Subject: [PATCH 1/2] added conversion date/time/float, support file_rows_needed, fixed support s3object?, more accuracy xfail in tests --- pandas/io/excel/_calaminereader.py | 65 +++++++--- pandas/tests/io/excel/test_readers.py | 180 +++++++++++++++++++++----- 2 files changed, 192 insertions(+), 53 deletions(-) diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index d107c1d89cc39..af7d2016648c6 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -1,23 +1,31 @@ from __future__ import annotations -from io import ( - BufferedReader, - BytesIO, +from datetime import ( + date, + datetime, + time, ) -from pathlib import PurePath from tempfile import NamedTemporaryFile +from typing import Union from pandas._typing import ( + FilePath, + ReadBuffer, Scalar, StorageOptions, ) from pandas.compat._optional import import_optional_dependency +import pandas as pd + +from pandas.io.common import stringify_path from pandas.io.excel._base import ( BaseExcelReader, inspect_excel_format, ) +ValueT = Union[int, float, str, bool, time, date, datetime] + class __calamine__: pass @@ -28,7 +36,9 @@ class CalamineExcelReader(BaseExcelReader): _sheet_names: list[str] | None = None def __init__( - self, filepath_or_buffer, storage_options: StorageOptions = None + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions = None, ) -> None: import_optional_dependency("python_calamine") super().__init__(filepath_or_buffer, storage_options=storage_options) @@ -37,20 +47,15 @@ def __init__( def _workbook_class(self) -> type[__calamine__]: return __calamine__ - def load_workbook( - self, filepath_or_buffer: str | PurePath | BufferedReader | BytesIO - ) -> str: - if isinstance(filepath_or_buffer, BufferedReader): - filepath_or_buffer = filepath_or_buffer.name - - elif isinstance(filepath_or_buffer, BytesIO): + def load_workbook(self, filepath_or_buffer) -> str: + if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"): ext = inspect_excel_format(filepath_or_buffer) with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: - tmp_file.write(filepath_or_buffer.getvalue()) + filepath_or_buffer.seek(0) + tmp_file.write(filepath_or_buffer.read()) filepath_or_buffer = tmp_file.name - - elif isinstance(filepath_or_buffer, PurePath): - filepath_or_buffer = filepath_or_buffer.as_posix() + else: + filepath_or_buffer = stringify_path(filepath_or_buffer) assert isinstance(filepath_or_buffer, str) @@ -75,7 +80,31 @@ def get_sheet_by_index(self, index: int) -> int: self.raise_if_bad_sheet_by_index(index) return index - def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Scalar]]: + def get_sheet_data( + self, sheet: int, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: ValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, time): + return value.isoformat() + + return value + from python_calamine import get_sheet_data - return get_sheet_data(self.book, sheet) + rows = get_sheet_data(self.book, sheet) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 87191f451fb29..4e408a72db7c6 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -168,10 +168,16 @@ def test_usecols_int(self, read_ext): ) def test_usecols_list(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -192,10 +198,16 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df2, df_ref, check_names=False) def test_usecols_str(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -250,10 +262,16 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): def test_usecols_diff_positional_int_columns_order( self, request, engine, read_ext, usecols, df_ref ): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -272,10 +290,16 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r tm.assert_frame_equal(result, expected, check_names=False) def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -284,10 +308,16 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -364,10 +394,16 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -376,10 +412,16 @@ def test_excel_cell_error_na(self, request, engine, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_table(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -397,10 +439,16 @@ def test_excel_table(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -745,10 +793,16 @@ def test_exception_message_includes_sheet_name(self, read_ext): @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -776,12 +830,19 @@ def test_date_conversion_overflow(self, request, engine, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, request, read_ext, engine, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" + ) + ) + filename = "test1" sheet_name = "Sheet1" @@ -950,10 +1011,16 @@ def test_close_from_py_localpath(self, read_ext): f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -984,10 +1051,16 @@ def test_reader_seconds(self, request, engine, read_ext): def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1079,13 +1152,19 @@ def test_read_excel_multiindex_blank_after_name( self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes" f"not supported by {engine} (GH4679)" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" + ) + ) mi_file = "testmultiindex" + read_ext mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) @@ -1202,10 +1281,16 @@ def test_read_excel_bool_header_arg(self, read_ext): def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1257,10 +1342,16 @@ def test_read_excel_skiprows(self, request, engine, read_ext): def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1517,10 +1608,16 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): tm.assert_frame_equal(parsed, expected) def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1546,10 +1643,16 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1638,10 +1741,16 @@ def test_header_with_index_col(self, filename): def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1699,3 +1808,4 @@ def test_corrupt_files_closed(self, engine, read_ext): pd.ExcelFile(file, engine=engine) except errors: pass +ф \ No newline at end of file From 08a561639e2e63bb7e570066a84ddd15e6980ea2 Mon Sep 17 00:00:00 2001 From: Dmitriy <3132181+dimastbk@users.noreply.github.com> Date: Thu, 26 Jan 2023 13:23:19 +0600 Subject: [PATCH 2/2] Update test_readers.py --- pandas/tests/io/excel/test_readers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4e408a72db7c6..cec4e636db400 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1808,4 +1808,3 @@ def test_corrupt_files_closed(self, engine, read_ext): pd.ExcelFile(file, engine=engine) except errors: pass -ф \ No newline at end of file