Skip to content

Commit

Permalink
FEAT-#1285: Add sem implementation for Series and DataFrame
Browse files Browse the repository at this point in the history
Signed-off-by: Igoshev, Yaroslav <[email protected]>
  • Loading branch information
YarShev committed Sep 9, 2020
1 parent 9c03512 commit beb13ee
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 28 deletions.
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``select_dtypes`` | `select_dtypes`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``sem`` | `sem`_ | D | |
| ``sem`` | `sem`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``set_axis`` | `set_axis`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+
| ``searchsorted`` | Y |
+-----------------------------+---------------------------------+
| ``sem`` | D |
| ``sem`` | Y |
+-----------------------------+---------------------------------+
| ``set_axis`` | Y |
+-----------------------------+---------------------------------+
Expand Down
12 changes: 12 additions & 0 deletions modin/backends/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,18 @@ def skew(self, **kwargs):
"""
pass

@abc.abstractmethod
def sem(self, **kwargs):
    """
    Return the standard error of the mean over the requested axis.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments supplied by the caller. Their interpretation is
        left to the concrete query compiler that overrides this method.

    Returns
    -------
    BaseQueryCompiler
        QueryCompiler containing the standard error of the mean over the
        requested axis.
    """
    # Abstract placeholder: concrete query compilers provide the behavior.
    pass

@abc.abstractmethod
def std(self, **kwargs):
"""Returns standard deviation of each column or row.
Expand Down
1 change: 1 addition & 0 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,7 @@ def sort_index_for_equal_values(result, ascending):
nunique = ReductionFunction.register(pandas.DataFrame.nunique)
skew = ReductionFunction.register(pandas.DataFrame.skew)
kurt = ReductionFunction.register(pandas.DataFrame.kurt)
sem = ReductionFunction.register(pandas.DataFrame.sem)
std = ReductionFunction.register(pandas.DataFrame.std)
var = ReductionFunction.register(pandas.DataFrame.var)
sum_min_count = ReductionFunction.register(pandas.DataFrame.sum)
Expand Down
13 changes: 0 additions & 13 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2544,19 +2544,6 @@ def sample(
query_compiler = self._query_compiler.getitem_row_array(samples)
return self.__constructor__(query_compiler=query_compiler)

def sem(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
return self._default_to_pandas(
"sem",
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)

def set_axis(self, labels, axis=0, inplace=False):
"""Assign desired index to given axis.
Expand Down
53 changes: 53 additions & 0 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2180,6 +2180,59 @@ def is_dtype_instance_mapper(column, dtype):
]
return self.drop(columns=self.columns[indicate], inplace=False)

def sem(
    self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
    """
    Return unbiased standard error of the mean over requested axis.

    Normalized by N-1 by default. This can be changed using the ``ddof``
    argument.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis to reduce over; resolved through ``self._get_axis_number``.
    skipna : bool, default True
        Exclude NA/null values. If an entire row/column is NA,
        the result will be NA.
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a Series.
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is
        N - ddof, where N represents the number of elements.
    numeric_only : bool, default None
        Include only float, int, boolean columns. If None, will attempt to
        use everything, then use only numeric data.
        Not implemented for Series.
    **kwargs : dict
        Extra keyword arguments forwarded unchanged to the query compiler.

    Returns
    -------
    Series or DataFrame (if level specified)
    """
    axis = self._get_axis_number(axis)
    # Only an explicit falsy ``numeric_only`` (not the ``None`` default)
    # triggers dtype validation before the computation is dispatched.
    if numeric_only is not None and not numeric_only:
        self._validate_dtypes(numeric_only=True)
    # The query-compiler call is the same whether or not ``level`` is given;
    # only the wrapping of its result differs, so dispatch exactly once.
    result_query_compiler = self._query_compiler.sem(
        axis=axis,
        skipna=skipna,
        level=level,
        ddof=ddof,
        numeric_only=numeric_only,
        **kwargs,
    )
    if level is not None:
        # With a level the result is wrapped by this object's constructor.
        return self.__constructor__(query_compiler=result_query_compiler)
    return self._reduce_dimension(result_query_compiler)

def set_index(
self, keys, drop=True, append=False, inplace=False, verify_integrity=False
):
Expand Down
53 changes: 53 additions & 0 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1092,6 +1092,59 @@ def nsmallest(self, n=5, keep="first"):
"""
return Series(query_compiler=self._query_compiler.nsmallest(n=n, keep=keep))

def sem(
    self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
    """
    Return unbiased standard error of the mean over requested axis.

    Normalized by N-1 by default. This can be changed using the ``ddof``
    argument.

    Parameters
    ----------
    axis : {index (0)}
        Axis to reduce over; resolved through ``self._get_axis_number``.
    skipna : bool, default True
        Exclude NA/null values. If the entire Series is NA,
        the result will be NA.
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level; the result is then a Series rather than a scalar.
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is
        N - ddof, where N represents the number of elements.
    numeric_only : bool, default None
        Include only float, int, boolean data. If None, will attempt to
        use everything, then use only numeric data.
    **kwargs : dict
        Extra keyword arguments forwarded unchanged to the query compiler.

    Returns
    -------
    Scalar or Series (if level specified)
    """
    axis = self._get_axis_number(axis)
    # Only an explicit falsy ``numeric_only`` (not the ``None`` default)
    # triggers dtype validation before the computation is dispatched.
    if numeric_only is not None and not numeric_only:
        self._validate_dtypes(numeric_only=True)
    # The query-compiler call is the same whether or not ``level`` is given;
    # only the wrapping of its result differs, so dispatch exactly once.
    result_query_compiler = self._query_compiler.sem(
        axis=axis,
        skipna=skipna,
        level=level,
        ddof=ddof,
        numeric_only=numeric_only,
        **kwargs,
    )
    if level is not None:
        # With a level the result is wrapped by this object's constructor.
        return self.__constructor__(query_compiler=result_query_compiler)
    return self._reduce_dimension(result_query_compiler)

def slice_shift(self, periods=1, axis=0):
"""
Equivalent to `shift` without copying data.
Expand Down
1 change: 0 additions & 1 deletion modin/pandas/test/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}),
("mask", lambda df: {"cond": df != 0}),
("pct_change", None),
("sem", None),
("__getstate__", None),
("to_xarray", None),
("pivot_table", lambda df: {"values": "int_col", "index": ["float_col"]}),
Expand Down
30 changes: 25 additions & 5 deletions modin/pandas/test/dataframe/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,16 +456,16 @@ def test_median_skew_transposed(axis, method):
),
],
)
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank"])
def test_median_skew_std_var_rank_specific(numeric_only, method):
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank", "sem"])
def test_median_skew_std_var_rank_sem_specific(numeric_only, method):
eval_general(
*create_test_dfs(test_data_diff_dtype),
lambda df: getattr(df, method)(numeric_only=numeric_only),
)


@pytest.mark.parametrize("method", ["median", "skew", "std", "var"])
def test_median_skew_std_var_1953(method):
@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "sem"])
def test_median_skew_std_var_sem_1953(method):
# See #1953 for details
arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
Expand Down Expand Up @@ -633,11 +633,31 @@ def test_rank_transposed(axis, na_option):
)


@pytest.mark.parametrize(
    "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna, ddof):
    """
    Exercise ``DataFrame.sem`` on the ``float_nan_data`` dataset.

    Runs once per ``skipna``/``ddof`` combination and hands the frames
    returned by ``create_test_dfs`` to ``eval_general``.
    """
    # NOTE(review): eval_general presumably compares the Modin and pandas
    # results; confirm against its definition in the test utilities.
    frames = create_test_dfs(test_data["float_nan_data"])

    def compute_sem(frame):
        return frame.sem(skipna=skipna, ddof=ddof)

    eval_general(*frames, compute_sem)


@pytest.mark.parametrize("axis", ["rows", "columns"])
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_int_only(axis, ddof):
    """
    Exercise ``DataFrame.sem`` on the ``int_data`` dataset.

    Runs once per ``axis``/``ddof`` combination and hands the frames
    returned by ``create_test_dfs`` to ``eval_general``.
    """
    # NOTE(review): eval_general presumably compares the Modin and pandas
    # results; confirm against its definition in the test utilities.
    frames = create_test_dfs(test_data["int_data"])

    def compute_sem(frame):
        return frame.sem(axis=axis, ddof=ddof)

    eval_general(*frames, compute_sem)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("method", ["str", "var", "rank"])
@pytest.mark.parametrize("method", ["std", "var", "rank"])
def test_std_var_rank(axis, skipna, method):
eval_general(
*create_test_dfs(test_data["float_nan_data"]),
Expand Down
28 changes: 21 additions & 7 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2036,8 +2036,10 @@ def test_median(data, skipna):
df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna))


@pytest.mark.parametrize("method", ["median", "skew", "std", "sum", "var", "prod"])
def test_median_skew_std_sum_var_prod_1953(method):
@pytest.mark.parametrize(
"method", ["median", "skew", "std", "sum", "var", "prod", "sem"]
)
def test_median_skew_std_sum_var_prod_sem_1953(method):
# See #1953 for details
data = [3, 3, 3, 3, 3, 3, 3, 3, 3]
arrays = [
Expand Down Expand Up @@ -2736,11 +2738,23 @@ def test_searchsorted(
assert case


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_sem(data):
modin_series, _ = create_test_series(data) # noqa: F841
with pytest.warns(UserWarning):
modin_series.sem()
@pytest.mark.parametrize(
    "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_float_nan_only(skipna, ddof):
    """
    Exercise ``Series.sem`` on the ``float_nan_data`` dataset.

    Runs once per ``skipna``/``ddof`` combination and hands the objects
    returned by ``create_test_series`` to ``eval_general``.
    """
    # NOTE(review): eval_general presumably compares the Modin and pandas
    # results; confirm against its definition in the test utilities.
    series_pair = create_test_series(test_data["float_nan_data"])

    def compute_sem(series):
        return series.sem(skipna=skipna, ddof=ddof)

    eval_general(*series_pair, compute_sem)


@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys))
def test_sem_int_only(ddof):
    """
    Exercise ``Series.sem`` on the ``int_data`` dataset.

    Runs once per ``ddof`` value and hands the objects returned by
    ``create_test_series`` to ``eval_general``.
    """
    # NOTE(review): eval_general presumably compares the Modin and pandas
    # results; confirm against its definition in the test utilities.
    series_pair = create_test_series(test_data["int_data"])

    def compute_sem(series):
        return series.sem(ddof=ddof)

    eval_general(*series_pair, compute_sem)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
Expand Down

0 comments on commit beb13ee

Please sign in to comment.