
FEAT-#1285: Add sem implementation for Series and DataFrame #2048

Merged: 1 commit, Sep 9, 2020
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
@@ -328,7 +328,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``select_dtypes`` | `select_dtypes`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
-| ``sem`` | `sem`_ | D | |
+| ``sem`` | `sem`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``set_axis`` | `set_axis`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
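With this change, ``sem`` is marked as natively supported (Y) instead of defaulting to pandas (D). A minimal usage sketch of what that means for callers, assuming a Modin build that contains this PR (the data is illustrative):

```python
import modin.pandas as pd

# With this PR, df.sem() runs through Modin's own reduction path instead of
# falling back to pandas (which previously emitted a default-to-pandas warning).
df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]})
print(df.sem())        # standard error of the mean, one value per column
print(df.sem(ddof=0))  # population variant: divisor N instead of N - 1
```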
2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
@@ -362,7 +362,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+
| ``searchsorted`` | Y |
+-----------------------------+---------------------------------+
-| ``sem`` | D |
+| ``sem`` | Y |
+-----------------------------+---------------------------------+
| ``set_axis`` | Y |
+-----------------------------+---------------------------------+
12 changes: 12 additions & 0 deletions modin/backends/base/query_compiler.py
@@ -670,6 +670,18 @@ def skew(self, **kwargs):
"""
pass

@abc.abstractmethod
def sem(self, **kwargs):
"""
        Return the standard error of the mean over the requested axis.

Returns
-------
BaseQueryCompiler
            QueryCompiler containing the standard error of the mean over the requested axis.
"""
pass

@abc.abstractmethod
def std(self, **kwargs):
"""Returns standard deviation of each column or row.
1 change: 1 addition & 0 deletions modin/backends/pandas/query_compiler.py
@@ -740,6 +740,7 @@ def sort_index_for_equal_values(result, ascending):
nunique = ReductionFunction.register(pandas.DataFrame.nunique)
skew = ReductionFunction.register(pandas.DataFrame.skew)
kurt = ReductionFunction.register(pandas.DataFrame.kurt)
sem = ReductionFunction.register(pandas.DataFrame.sem)
std = ReductionFunction.register(pandas.DataFrame.std)
var = ReductionFunction.register(pandas.DataFrame.var)
sum_min_count = ReductionFunction.register(pandas.DataFrame.sum)
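``ReductionFunction.register(pandas.DataFrame.sem)`` exposes the pandas reduction through the query compiler. As a reminder of what that reduction computes, here is a plain-pandas sanity check of the underlying formula (standard deviation divided by the square root of the number of non-NA observations); the data is illustrative:

```python
import numpy as np
import pandas

df = pandas.DataFrame({"a": [1.0, 2.0, 4.0], "b": [2.0, np.nan, 8.0]})

# sem = std(ddof) / sqrt(count of non-NA values), computed per column
expected = df.std(ddof=1) / np.sqrt(df.count())
assert np.allclose(df.sem(ddof=1), expected, equal_nan=True)
```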
13 changes: 0 additions & 13 deletions modin/pandas/base.py
@@ -2544,19 +2544,6 @@ def sample(
query_compiler = self._query_compiler.getitem_row_array(samples)
return self.__constructor__(query_compiler=query_compiler)

def sem(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
return self._default_to_pandas(
"sem",
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)

def set_axis(self, labels, axis=0, inplace=False):
"""Assign desired index to given axis.

53 changes: 53 additions & 0 deletions modin/pandas/dataframe.py
@@ -2180,6 +2180,59 @@ def is_dtype_instance_mapper(column, dtype):
]
return self.drop(columns=self.columns[indicate], inplace=False)

def sem(
Collaborator commented:
The implementation of the method seems identical for Series and DataFrame. Can the implementation be defined in backends/pandas/query_compiler?

Collaborator (author) replied:
The implementation is specific to Series and DataFrame. Because we treat a Series as a DataFrame in the backend, the implementation can't be defined in backends/pandas/query_compiler.

self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
"""
Return unbiased standard error of the mean over requested axis.

        Normalized by N-1 by default. This can be changed using the ddof argument.

Parameters
----------
axis : {index (0), columns (1)}
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
level : int or level name, default None
If the axis is a MultiIndex (hierarchical), count along a particular level,
collapsing into a Series.
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
where N represents the number of elements.
numeric_only : bool, default None
Include only float, int, boolean columns. If None, will attempt to use everything,
then use only numeric data. Not implemented for Series.

Returns
-------
Series or DataFrame (if level specified)
"""
axis = self._get_axis_number(axis)
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
if level is not None:
return self.__constructor__(
query_compiler=self._query_compiler.sem(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)
return self._reduce_dimension(
self._query_compiler.sem(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)

def set_index(
self, keys, drop=True, append=False, inplace=False, verify_integrity=False
):
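For reference, a minimal usage sketch of the new ``DataFrame.sem`` frontend under the same assumptions as above (illustrative data, a Modin build containing this PR). Note that the default reduction returns a Series, mirroring pandas:

```python
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

col_sem = df.sem()        # axis=0 (default): one value per column, returned as a Series
row_sem = df.sem(axis=1)  # axis=1: one value per row
loose = df.sem(ddof=0)    # divisor N instead of N - 1
```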
53 changes: 53 additions & 0 deletions modin/pandas/series.py
@@ -1092,6 +1092,59 @@ def nsmallest(self, n=5, keep="first"):
"""
return Series(query_compiler=self._query_compiler.nsmallest(n=n, keep=keep))

def sem(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
"""
Return unbiased standard error of the mean over requested axis.

        Normalized by N-1 by default. This can be changed using the ddof argument.

Parameters
----------
axis : {index (0), columns (1)}
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA,
the result will be NA.
level : int or level name, default None
If the axis is a MultiIndex (hierarchical), count along a particular level,
collapsing into a Series.
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
where N represents the number of elements.
numeric_only : bool, default None
Include only float, int, boolean columns. If None, will attempt to use everything,
then use only numeric data. Not implemented for Series.

Returns
-------
Scalar or Series (if level specified)
"""
axis = self._get_axis_number(axis)
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
if level is not None:
return self.__constructor__(
query_compiler=self._query_compiler.sem(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)
return self._reduce_dimension(
self._query_compiler.sem(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)

def slice_shift(self, periods=1, axis=0):
"""
Equivalent to `shift` without copying data.
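This is where the Series/DataFrame split discussed in the review thread shows up: the Series frontend reduces to a scalar, while the ``level`` path keeps a Series. A minimal sketch under the same assumptions (illustrative data; the grouping semantics of ``level`` follow pandas):

```python
import pandas
import modin.pandas as pd

idx = pandas.MultiIndex.from_arrays([["1", "1", "2", "2"], ["1", "2", "3", "4"]])
s = pd.Series([1, 2, 3, 4], index=idx)

print(s.sem())         # no level: the result is a single scalar
print(s.sem(level=0))  # level given: one value per group of the first index level
```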
1 change: 0 additions & 1 deletion modin/pandas/test/dataframe/test_default.py
@@ -62,7 +62,6 @@
("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}),
("mask", lambda df: {"cond": df != 0}),
("pct_change", None),
("sem", None),
("__getstate__", None),
("to_xarray", None),
("pivot_table", lambda df: {"values": "int_col", "index": ["float_col"]}),
30 changes: 25 additions & 5 deletions modin/pandas/test/dataframe/test_window.py
@@ -456,16 +456,16 @@ def test_median_skew_transposed(axis, method):
),
],
)
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank"])
def test_median_skew_std_var_rank_specific(numeric_only, method):
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank", "sem"])
def test_median_skew_std_var_rank_sem_specific(numeric_only, method):
eval_general(
*create_test_dfs(test_data_diff_dtype),
lambda df: getattr(df, method)(numeric_only=numeric_only),
)


@pytest.mark.parametrize("method", ["median", "skew", "std", "var"])
def test_median_skew_std_var_1953(method):
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "sem"])
def test_median_skew_std_var_sem_1953(method):
# See #1953 for details
arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
@@ -633,11 +633,31 @@ def test_rank_transposed(axis, na_option):
)


@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna, ddof):
eval_general(
*create_test_dfs(test_data["float_nan_data"]),
lambda df: df.sem(skipna=skipna, ddof=ddof),
)


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_int_only(axis, ddof):
eval_general(
*create_test_dfs(test_data["int_data"]),
lambda df: df.sem(axis=axis, ddof=ddof),
)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("method", ["str", "var", "rank"])
@pytest.mark.parametrize("method", ["std", "var", "rank"])
def test_std_var_rank(axis, skipna, method):
eval_general(
*create_test_dfs(test_data["float_nan_data"]),
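The new tests go through Modin's ``eval_general``/``create_test_dfs`` helpers from the test suite. As a rough, hypothetical illustration of the kind of check they perform (this is not the actual helper code, and ``_to_pandas()`` is used here only as an assumed conversion step): run the same ``sem`` call on a Modin frame and a pandas frame and compare the results.

```python
import pandas
import modin.pandas as mpd
from pandas.testing import assert_series_equal

data = {"a": [1.0, float("nan"), 3.0], "b": [4.0, 5.0, 6.0]}
modin_df, pandas_df = mpd.DataFrame(data), pandas.DataFrame(data)

for ddof in (0, 1):
    # Compare the Modin result (converted to pandas) against the pandas result;
    # the overall shape of this check approximates what eval_general does.
    assert_series_equal(
        modin_df.sem(skipna=True, ddof=ddof)._to_pandas(),
        pandas_df.sem(skipna=True, ddof=ddof),
    )
```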
28 changes: 21 additions & 7 deletions modin/pandas/test/test_series.py
@@ -2036,8 +2036,10 @@ def test_median(data, skipna):
df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna))


@pytest.mark.parametrize("method", ["median", "skew", "std", "sum", "var", "prod"])
def test_median_skew_std_sum_var_prod_1953(method):
@pytest.mark.parametrize(
"method", ["median", "skew", "std", "sum", "var", "prod", "sem"]
)
def test_median_skew_std_sum_var_prod_sem_1953(method):
# See #1953 for details
data = [3, 3, 3, 3, 3, 3, 3, 3, 3]
arrays = [
@@ -2736,11 +2738,23 @@ def test_searchsorted(
assert case


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_sem(data):
modin_series, _ = create_test_series(data) # noqa: F841
with pytest.warns(UserWarning):
modin_series.sem()
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna, ddof):
eval_general(
*create_test_series(test_data["float_nan_data"]),
lambda df: df.sem(skipna=skipna, ddof=ddof),
)


@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_int_only(ddof):
eval_general(
*create_test_series(test_data["int_data"]),
lambda df: df.sem(ddof=ddof),
)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)