Skip to content

Commit

Permalink
FIX-#1927: Fix performance issue related to sparse attribute access
Browse files: browse the repository at this point in the history
Signed-off-by: Igoshev, Yaroslav <[email protected]>
  • Loading branch information
YarShev committed Oct 30, 2020
1 parent a11e7c9 commit 51f2930
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 20 deletions.
111 changes: 111 additions & 0 deletions modin/pandas/accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pandas
from pandas.core.arrays.sparse.dtype import SparseDtype

from modin.utils import _inherit_docstrings


class BaseSparseAccessor:
    """
    Shared base for the ``.sparse`` accessor classes defined in this module.

    The constructor keeps a reference to the object the accessor is attached
    to and immediately hands it to ``_validate``; subclasses override
    ``_validate`` to reject unsupported data.
    """

    # Message carried by the error subclasses raise for unsupported data.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        """
        Store `data` as the accessor's parent and validate it.

        Parameters
        ----------
        data : object, default: None
            Object the accessor is attached to.
        """
        # The parent is stored before validation, exactly as callers of
        # ``_validate`` overrides may expect ``self._parent`` to be available.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that `data` is acceptable for this accessor.

        Raises
        ------
        NotImplementedError
            Always; the base class provides no validation of its own.
        """
        raise NotImplementedError

    def _default_to_pandas(self, op, *args, **kwargs):
        """
        Run `op` on the ``sparse`` attribute of the object the parent supplies.

        The work is delegated to ``self._parent._default_to_pandas``, which
        receives a one-argument callable.  That callable invokes
        ``op(<argument>.sparse, *args, **kwargs)``.

        Parameters
        ----------
        op : callable
            Operation taking a ``.sparse`` accessor as its first argument.
        *args : tuple
            Extra positional arguments forwarded to `op`.
        **kwargs : dict
            Extra keyword arguments forwarded to `op`.

        Returns
        -------
        object
            Whatever the parent's ``_default_to_pandas`` returns.
        """
        owner = self._parent
        return owner._default_to_pandas(
            lambda pandas_obj: op(pandas_obj.sparse, *args, **kwargs)
        )


@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor)
class SparseFrameAccessor(BaseSparseAccessor):
    # ``.sparse`` accessor for frame-like parents.
    # NOTE(review): user-facing docstrings are presumably supplied by the
    # ``_inherit_docstrings`` decorator from the pandas class named above, so
    # the members below are documented with comments only.

    def _validate(self, data):
        # Reject the parent unless every entry of ``data.dtypes`` is a
        # ``SparseDtype``.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @property
    def density(self):
        # The pandas accessor is passed to the parent as the operation and
        # ``density`` is read from what comes back.
        # NOTE(review): assumes the parent's ``_default_to_pandas`` calls the
        # operation on the pandas equivalent of the parent - confirm.
        return self._parent._default_to_pandas(pandas.DataFrame.sparse).density

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        # This is an alternate constructor: there is no accessor instance and
        # therefore no ``_parent`` to delegate to.  Calling the instance
        # method ``_default_to_pandas`` through ``cls`` would bind the pandas
        # function as ``self`` and fail with ``AttributeError``, so the frame
        # is built with pandas directly and then wrapped in a Modin frame.
        #
        # Both imports are local: ``.dataframe`` imports this module, so a
        # top-level import here would be circular.
        from modin.error_message import ErrorMessage
        from .dataframe import DataFrame

        ErrorMessage.default_to_pandas("`from_spmatrix`")
        return DataFrame(
            pandas.DataFrame.sparse.from_spmatrix(data, index=index, columns=columns)
        )

    def to_dense(self):
        # Delegates to the pandas implementation through the parent.
        return self._default_to_pandas(pandas.DataFrame.sparse.to_dense)

    def to_coo(self):
        # Delegates to the pandas implementation through the parent.
        return self._default_to_pandas(pandas.DataFrame.sparse.to_coo)


@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor)
class SparseAccessor(BaseSparseAccessor):
    # ``.sparse`` accessor for series-like parents.
    # NOTE(review): user-facing docstrings are presumably supplied by the
    # ``_inherit_docstrings`` decorator from the pandas class named above, so
    # the members below are documented with comments only.

    def _validate(self, data):
        # Reject the parent unless its ``dtype`` is a ``SparseDtype``.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    # Each property below passes the pandas accessor to the parent as the
    # operation and reads one attribute from what comes back.
    # NOTE(review): assumes the parent's ``_default_to_pandas`` calls the
    # operation on the pandas equivalent of the parent - confirm.

    @property
    def density(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).density

    @property
    def fill_value(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).fill_value

    @property
    def npoints(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).npoints

    @property
    def sp_values(self):
        return self._parent._default_to_pandas(pandas.Series.sparse).sp_values

    @classmethod
    def from_coo(cls, A, dense_index=False):
        # This is an alternate constructor: there is no accessor instance and
        # therefore no ``_parent`` to delegate to.  Calling the instance
        # method ``_default_to_pandas`` through ``cls`` would bind the pandas
        # function as ``self`` and fail with ``AttributeError``, so the series
        # is built with pandas directly and then wrapped in a Modin series.
        #
        # Both imports are local: ``.series`` imports this module, so a
        # top-level import here would be circular.
        from modin.error_message import ErrorMessage
        from .series import Series

        ErrorMessage.default_to_pandas("`from_coo`")
        return Series(pandas.Series.sparse.from_coo(A, dense_index=dense_index))

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        # Delegates to the pandas implementation through the parent,
        # forwarding every option by keyword.
        return self._default_to_pandas(
            pandas.Series.sparse.to_coo,
            row_levels=row_levels,
            column_levels=column_levels,
            sort_labels=sort_labels,
        )

    def to_dense(self):
        # Delegates to the pandas implementation through the parent.
        return self._default_to_pandas(pandas.Series.sparse.to_dense)


@_inherit_docstrings(pandas.core.accessor.CachedAccessor)
class CachedAccessor:
    # Descriptor that builds an accessor object on first attribute access and
    # stores it on the owning instance under the same attribute name.
    # NOTE(review): user-facing docstrings are presumably supplied by the
    # ``_inherit_docstrings`` decorator from the pandas class named above.

    def __init__(self, name: str, accessor) -> None:
        # ``name`` is the attribute name the result is stored under;
        # ``accessor`` is the callable invoked with the owning instance.
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        if obj is not None:
            built = self._accessor(obj)
            # ``object.__setattr__`` is called directly, so any
            # ``__setattr__`` override on ``type(obj)`` is bypassed.
            object.__setattr__(obj, self._name, built)
            return built
        # Accessed on the class itself: hand back the accessor type/callable.
        return self._accessor
5 changes: 2 additions & 3 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from .series import Series
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
from .groupby import DataFrameGroupBy
from .accessor import CachedAccessor, SparseFrameAccessor


@_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__])
Expand Down Expand Up @@ -1594,9 +1595,7 @@ def set_index(
if not inplace:
return frame

@property
def sparse(self):
return self._default_to_pandas(pandas.DataFrame.sparse)
sparse = CachedAccessor("sparse", SparseFrameAccessor)

def squeeze(self, axis=None):
axis = self._get_axis_number(axis) if axis is not None else None
Expand Down
5 changes: 2 additions & 3 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
from .iterator import PartitionIterator
from .utils import from_pandas, is_scalar
from .accessor import CachedAccessor, SparseAccessor


@_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__])
Expand Down Expand Up @@ -1187,9 +1188,7 @@ def sort_values(
result._query_compiler, inplace=inplace
)

@property
def sparse(self):
return self._default_to_pandas(pandas.Series.sparse)
sparse = CachedAccessor("sparse", SparseAccessor)

def squeeze(self, axis=None):
if axis is not None:
Expand Down
13 changes: 10 additions & 3 deletions modin/pandas/test/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,13 @@ def test___bool__(data):
eval_general(*create_test_dfs(data), lambda df: df.__bool__())


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_hasattr_sparse(data):
eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse"))
@pytest.mark.parametrize(
    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
)
def test_hasattr_sparse(is_sparse_data):
    """Compare ``hasattr(df, "sparse")`` between Modin and pandas frames."""
    source = test_data["float_nan_data"]
    if is_sparse_data:
        # Sparse case: feed the same values through a pandas SparseArray.
        source = pandas.arrays.SparseArray(source.values())
    modin_df, pandas_df = create_test_dfs(source)
    eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
1 change: 1 addition & 0 deletions modin/pandas/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def test_top_level_api_equality():
"DEFAULT_NPARTITIONS",
"iterator",
"series",
"accessor",
"base",
"utils",
"dataframe",
Expand Down
23 changes: 12 additions & 11 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4397,17 +4397,18 @@ def test_encode(data, encoding_type):
df_equals(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
def test_hasattr_sparse(data):
modin_series, pandas_series = create_test_series(data)
try:
pandas_result = hasattr(pandas_series, "sparse")
except Exception as e:
with pytest.raises(type(e)):
hasattr(modin_series, "sparse")
else:
modin_result = hasattr(modin_series, "sparse")
assert modin_result == pandas_result
@pytest.mark.parametrize(
    "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
)
def test_hasattr_sparse(is_sparse_data):
    """Compare ``hasattr(series, "sparse")`` between Modin and pandas series."""
    source = test_data["float_nan_data"]
    if is_sparse_data:
        # Sparse case: feed the same values through a pandas SparseArray.
        source = pandas.arrays.SparseArray(source.values())
    modin_series, pandas_series = create_test_series(source)
    eval_general(modin_series, pandas_series, lambda df: hasattr(df, "sparse"))


@pytest.mark.parametrize(
Expand Down

0 comments on commit 51f2930

Please sign in to comment.