diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 8406c6b567430..193f837400ac0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -279,7 +279,6 @@ Other enhancements - :meth:`Series.dropna` and :meth:`DataFrame.dropna` has gained ``ignore_index`` keyword to reset index (:issue:`31725`) - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) -- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Added support for :meth:`Index.min` and :meth:`Index.max` for pyarrow string dtypes (:issue:`51397`) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 30d3445663b2d..2d06faaa0e36a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -42,8 +42,8 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) - :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:`44279`) +- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c36edcaf493ed..fb291d53f1556 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1456,7 +1456,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. """ - from pandas.io.excel._calaminereader import CalamineExcelReader + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1467,7 +1467,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, - "calamine": CalamineExcelReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calamine.py similarity index 98% rename from pandas/io/excel/_calaminereader.py rename to pandas/io/excel/_calamine.py index 27703f76d669a..c71c0c62ff682 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calamine.py @@ -29,7 +29,7 @@ _CellValueT = Union[int, float, str, bool, time, date, datetime] -class CalamineExcelReader(BaseExcelReader): +class CalamineReader(BaseExcelReader): @doc(storage_options=_shared_docs["storage_options"]) def __init__( self, diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4c171c43843ef..76df9c21424e8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -451,6 +451,10 @@ def test_reader_special_dtypes(self, request, engine, read_ext): reason="Calamine support parsing datetime only in xlsx" ) ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine can't parse this datetime format") + ) expected = DataFrame.from_dict( { @@ -584,11 +588,16 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, read_ext, dtype_backend): + def test_dtype_backend(self, request, engine, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame( { "a": Series([1, 3], dtype="Int64"), @@ -629,11 +638,16 @@ def test_dtype_backend(self, read_ext, dtype_backend): expected = df tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, request, engine, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) @@ -646,11 +660,16 @@ def test_dtype_backend_and_dtype(self, read_ext): tm.assert_frame_equal(result, df) @td.skip_if_no("pyarrow") - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, request, engine, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + import pyarrow as pa with pd.option_context("mode.string_storage", string_storage): @@ -694,8 +713,15 @@ def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) - def test_reader_spaces(self, read_ext): + def test_reader_spaces(self, request, engine, read_ext): # see gh-32207 + + # https://github.com/tafia/calamine/pull/289 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't respect spaces in ods") + ) + basename = "test_spaces" actual = pd.read_excel(basename + read_ext) @@ -790,12 +816,6 @@ def test_date_conversion_overflow(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: - request.node.add_marker( - pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" - ) - ) expected = DataFrame( [ @@ -806,6 +826,11 @@ def test_date_conversion_overflow(self, request, engine, read_ext): columns=["DateColWithBigInt", "StringCol"], ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail(reason="Maybe not supported by calamine") + ) + if engine == "openpyxl": request.node.add_marker( pytest.mark.xfail(reason="Maybe not supported by openpyxl") @@ -1008,6 +1033,12 @@ def test_reader_seconds(self, request, engine, read_ext): reason="Calamine support parsing datetime only in xlsx" ) ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support parsing milliseconds in datetime" + ) + ) # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( @@ -1174,10 +1205,17 @@ def test_read_excel_multiindex_blank_after_name( ) tm.assert_frame_equal(result, expected) - def test_read_excel_multiindex_header_only(self, read_ext): + def test_read_excel_multiindex_header_only(self, request, engine, read_ext): # see gh-11733. # # Don't try to parse a header name if there isn't one. + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + mi_file = "testmultiindex" + read_ext result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1]) @@ -1418,8 +1456,15 @@ def test_deprecated_kwargs(self, read_ext): with pytest.raises(TypeError, match="but 3 positional arguments"): pd.read_excel("test1" + read_ext, "Sheet1", 0) - def test_no_header_with_list_index_col(self, read_ext): + def test_no_header_with_list_index_col(self, request, engine, read_ext): # GH 31783 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] idx = MultiIndex.from_tuples( @@ -1439,8 +1484,15 @@ def test_one_col_noskip_blank_line(self, read_ext): result = pd.read_excel(file_name) tm.assert_frame_equal(result, expected) - def test_multiheader_two_blank_lines(self, read_ext): + def test_multiheader_two_blank_lines(self, request, engine, read_ext): # GH 40442 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] @@ -1703,7 +1755,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected)