From 51f2930c7542f5d7a6b01d563ce3f5ff9de13e5d Mon Sep 17 00:00:00 2001
From: "Igoshev, Yaroslav"
Date: Fri, 30 Oct 2020 11:11:49 +0300
Subject: [PATCH] FIX-#1927: Fix performance issue related to `sparse` attribute access

Signed-off-by: Igoshev, Yaroslav
---
Review notes (text between the `---` line and the diffstat is ignored by `git am`):
 * `SparseFrameAccessor.from_spmatrix` and `SparseAccessor.from_coo` were
   classmethods that called the instance method `_default_to_pandas` through
   `cls`; the pandas function was bound as `self` and the call failed with
   AttributeError. They now build the pandas object directly and wrap it in
   the corresponding Modin object.
 * The accessor.py hunk size and the diffstat totals are updated accordingly;
   the `index` blob id of accessor.py was not regenerated.

 modin/pandas/accessor.py                    | 135 ++++++++++++++++++++
 modin/pandas/dataframe.py                   |   5 +-
 modin/pandas/series.py                      |   5 +-
 modin/pandas/test/dataframe/test_default.py |  13 ++-
 modin/pandas/test/test_api.py               |   1 +
 modin/pandas/test/test_series.py            |  23 ++--
 6 files changed, 162 insertions(+), 20 deletions(-)
 create mode 100644 modin/pandas/accessor.py

diff --git a/modin/pandas/accessor.py b/modin/pandas/accessor.py
new file mode 100644
index 00000000000..b4895b7eabc
--- /dev/null
+++ b/modin/pandas/accessor.py
@@ -0,0 +1,135 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import pandas
+from pandas.core.arrays.sparse.dtype import SparseDtype
+
+from modin.error_message import ErrorMessage
+from modin.utils import _inherit_docstrings
+
+
+class BaseSparseAccessor:
+    """
+    Common logic of the Modin `.sparse` accessors.
+
+    Construction validates the parent immediately, so `.sparse` on non-sparse
+    data raises `AttributeError` from the subclass' `_validate` check.
+    """
+
+    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."
+
+    def __init__(self, data=None):
+        self._parent = data
+        self._validate(data)
+
+    def _validate(self, data):
+        # Subclasses raise AttributeError when `data` holds no sparse values.
+        raise NotImplementedError
+
+    def _default_to_pandas(self, op, *args, **kwargs):
+        # `op` receives the pandas `.sparse` accessor of the converted parent.
+        return self._parent._default_to_pandas(
+            lambda parent: op(parent.sparse, *args, **kwargs)
+        )
+
+
+@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor)
+class SparseFrameAccessor(BaseSparseAccessor):
+    def _validate(self, data):
+        dtypes = data.dtypes
+        if not all(isinstance(t, SparseDtype) for t in dtypes):
+            raise AttributeError(self._validation_msg)
+
+    @property
+    def density(self):
+        return self._parent._default_to_pandas(pandas.DataFrame.sparse).density
+
+    @classmethod
+    def from_spmatrix(cls, data, index=None, columns=None):
+        # There is no parent frame in a classmethod, so the instance-level
+        # `_default_to_pandas` helper cannot be used: build the pandas frame
+        # directly and wrap it. The import is local because `dataframe.py`
+        # imports this module.
+        from .dataframe import DataFrame
+
+        ErrorMessage.default_to_pandas("`from_spmatrix`")
+        return DataFrame(
+            pandas.DataFrame.sparse.from_spmatrix(data, index=index, columns=columns)
+        )
+
+    def to_dense(self):
+        return self._default_to_pandas(pandas.DataFrame.sparse.to_dense)
+
+    def to_coo(self):
+        return self._default_to_pandas(pandas.DataFrame.sparse.to_coo)
+
+
+@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor)
+class SparseAccessor(BaseSparseAccessor):
+    def _validate(self, data):
+        if not isinstance(data.dtype, SparseDtype):
+            raise AttributeError(self._validation_msg)
+
+    @property
+    def density(self):
+        return self._parent._default_to_pandas(pandas.Series.sparse).density
+
+    @property
+    def fill_value(self):
+        return self._parent._default_to_pandas(pandas.Series.sparse).fill_value
+
+    @property
+    def npoints(self):
+        return self._parent._default_to_pandas(pandas.Series.sparse).npoints
+
+    @property
+    def sp_values(self):
+        return self._parent._default_to_pandas(pandas.Series.sparse).sp_values
+
+    @classmethod
+    def from_coo(cls, A, dense_index=False):
+        # No parent series exists in a classmethod, so construct the result
+        # through pandas and wrap it. The import is local because `series.py`
+        # imports this module.
+        from .series import Series
+
+        ErrorMessage.default_to_pandas("`from_coo`")
+        return Series(pandas.Series.sparse.from_coo(A, dense_index=dense_index))
+
+    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
+        return self._default_to_pandas(
+            pandas.Series.sparse.to_coo,
+            row_levels=row_levels,
+            column_levels=column_levels,
+            sort_labels=sort_labels,
+        )
+
+    def to_dense(self):
+        return self._default_to_pandas(pandas.Series.sparse.to_dense)
+
+
+@_inherit_docstrings(pandas.core.accessor.CachedAccessor)
+class CachedAccessor:
+    def __init__(self, name: str, accessor) -> None:
+        self._name = name
+        self._accessor = accessor
+
+    def __get__(self, obj, cls):
+        if obj is None:
+            # Accessed on the class itself: expose the accessor type.
+            return self._accessor
+        accessor_obj = self._accessor(obj)
+        # Store the accessor on the instance under the same name;
+        # `object.__setattr__` bypasses the parent's own `__setattr__`.
+        object.__setattr__(obj, self._name, accessor_obj)
+        return accessor_obj
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 2d7ee67161e..4c6e6af9846 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -54,6 +54,7 @@
 from .series import Series
 from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
 from .groupby import DataFrameGroupBy
+from .accessor import CachedAccessor, SparseFrameAccessor
 
 
 @_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__])
@@ -1594,9 +1595,7 @@ def set_index(
         if not inplace:
             return frame
 
-    @property
-    def sparse(self):
-        return self._default_to_pandas(pandas.DataFrame.sparse)
+    sparse = CachedAccessor("sparse", SparseFrameAccessor)
 
     def squeeze(self, axis=None):
         axis = self._get_axis_number(axis) if axis is not None else None
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 6a1e11e4929..c3833cfe64a 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -41,6 +41,7 @@
 from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
 from .iterator import PartitionIterator
 from .utils import from_pandas, is_scalar
+from .accessor import CachedAccessor, SparseAccessor
 
 
 @_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__])
@@ -1187,9 +1188,7 @@ def sort_values(
             result._query_compiler, inplace=inplace
         )
 
-    @property
-    def sparse(self):
-        return self._default_to_pandas(pandas.Series.sparse)
+    sparse = CachedAccessor("sparse", SparseAccessor)
 
     def squeeze(self, axis=None):
         if axis is not None:
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
index b8b39c203da..552a9fa7480 100644
--- a/modin/pandas/test/dataframe/test_default.py
+++ b/modin/pandas/test/dataframe/test_default.py
@@ -1151,6 +1151,13 @@ def test___bool__(data):
     eval_general(*create_test_dfs(data), lambda df: df.__bool__())
 
 
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test_hasattr_sparse(data):
-    eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse"))
+@pytest.mark.parametrize(
+    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
+)
+def test_hasattr_sparse(is_sparse_data):
+    modin_df, pandas_df = (
+        create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values()))
+        if is_sparse_data
+        else create_test_dfs(test_data["float_nan_data"])
+    )
+    eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py
index 319ae2bf505..abb907f639b 100644
--- a/modin/pandas/test/test_api.py
+++ b/modin/pandas/test/test_api.py
@@ -48,6 +48,7 @@ def test_top_level_api_equality():
         "DEFAULT_NPARTITIONS",
         "iterator",
         "series",
+        "accessor",
         "base",
         "utils",
         "dataframe",
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index ec4aba6879a..990c5c0292d 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -4397,17 +4397,18 @@ def test_encode(data, encoding_type):
     df_equals(modin_result, pandas_result)
 
 
-@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
-def test_hasattr_sparse(data):
-    modin_series, pandas_series = create_test_series(data)
-    try:
-        pandas_result = hasattr(pandas_series, "sparse")
-    except Exception as e:
-        with pytest.raises(type(e)):
-            hasattr(modin_series, "sparse")
-    else:
-        modin_result = hasattr(modin_series, "sparse")
-        assert modin_result == pandas_result
+@pytest.mark.parametrize(
+    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
+)
+def test_hasattr_sparse(is_sparse_data):
+    modin_series, pandas_series = (
+        create_test_series(
+            pandas.arrays.SparseArray(test_data["float_nan_data"].values())
+        )
+        if is_sparse_data
+        else create_test_series(test_data["float_nan_data"])
+    )
+    eval_general(modin_series, pandas_series, lambda s: hasattr(s, "sparse"))
 
 
 @pytest.mark.parametrize(