diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index ae4f6dc43f4..352f50f13d4 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -328,7 +328,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``select_dtypes`` | `select_dtypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``sem`` | `sem`_ | D | | +| ``sem`` | `sem`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``set_axis`` | `set_axis`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index b8996ad74e3..283eaceafbb 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -362,7 +362,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``searchsorted`` | Y | +-----------------------------+---------------------------------+ -| ``sem`` | D | +| ``sem`` | Y | +-----------------------------+---------------------------------+ | ``set_axis`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 06a513b5854..adc00814ac7 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -670,6 +670,18 @@ def skew(self, **kwargs): """ pass + @abc.abstractmethod + def sem(self, **kwargs): + """ + Returns standard deviation of the mean over requested axis. 
+ + Returns + ------- + BaseQueryCompiler + QueryCompiler containing the standard error of the mean over requested axis. + """ + pass + @abc.abstractmethod def std(self, **kwargs): """Returns standard deviation of each column or row. diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index d0c85507c7e..b6af408951a 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -740,6 +740,7 @@ def sort_index_for_equal_values(result, ascending): nunique = ReductionFunction.register(pandas.DataFrame.nunique) skew = ReductionFunction.register(pandas.DataFrame.skew) kurt = ReductionFunction.register(pandas.DataFrame.kurt) + sem = ReductionFunction.register(pandas.DataFrame.sem) std = ReductionFunction.register(pandas.DataFrame.std) var = ReductionFunction.register(pandas.DataFrame.var) sum_min_count = ReductionFunction.register(pandas.DataFrame.sum) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 6ab559f9212..542545fad5d 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2544,19 +2544,6 @@ def sample( query_compiler = self._query_compiler.getitem_row_array(samples) return self.__constructor__(query_compiler=query_compiler) - def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._default_to_pandas( - "sem", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def set_axis(self, labels, axis=0, inplace=False): """Assign desired index to given axis. 
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index bae377e57c9..1b4655ad285 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2180,6 +2180,59 @@ def is_dtype_instance_mapper(column, dtype): ] return self.drop(columns=self.columns[indicate], inplace=False) + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. 
+ + Returns + ------- + Series or DataFrame (if level specified) + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def set_index( self, keys, drop=True, append=False, inplace=False, verify_integrity=False ): diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 382f759c732..556d26cab53 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1092,6 +1092,59 @@ def nsmallest(self, n=5, keep="first"): """ return Series(query_compiler=self._query_compiler.nsmallest(n=n, keep=keep)) + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0)} + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. 
+ + Returns + ------- + Scalar or Series (if level specified) + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def slice_shift(self, periods=1, axis=0): """ Equivalent to `shift` without copying data. diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index c07849f2231..ddb6bc080d1 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -62,7 +62,6 @@ ("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}), ("mask", lambda df: {"cond": df != 0}), ("pct_change", None), - ("sem", None), ("__getstate__", None), ("to_xarray", None), ("pivot_table", lambda df: {"values": "int_col", "index": ["float_col"]}), diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index ce2bc0e2ba2..ff8ce9c5117 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -456,16 +456,16 @@ def test_median_skew_transposed(axis, method): ), ], ) -@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank"]) -def test_median_skew_std_var_rank_specific(numeric_only, method): +@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank", "sem"]) +def test_median_skew_std_var_rank_sem_specific(numeric_only, method): eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: getattr(df, method)(numeric_only=numeric_only), ) -@pytest.mark.parametrize("method", ["median", "skew", "std", "var"]) -def 
test_median_skew_std_var_1953(method): +@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "sem"]) +def test_median_skew_std_var_sem_1953(method): # See #1953 for details arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] @@ -633,11 +633,31 @@ def test_rank_transposed(axis, na_option): ) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_float_nan_only(skipna, ddof): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: df.sem(skipna=skipna, ddof=ddof), + ) + + +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_int_only(axis, ddof): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: df.sem(axis=axis, ddof=ddof), + ) + + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) -@pytest.mark.parametrize("method", ["str", "var", "rank"]) +@pytest.mark.parametrize("method", ["std", "var", "rank"]) def test_std_var_rank(axis, skipna, method): eval_general( *create_test_dfs(test_data["float_nan_data"]), diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index d21267c8242..5546367b56f 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2036,8 +2036,10 @@ def test_median(data, skipna): df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) -@pytest.mark.parametrize("method", ["median", "skew", "std", "sum", "var", "prod"]) -def test_median_skew_std_sum_var_prod_1953(method): +@pytest.mark.parametrize( + "method", ["median", "skew", "std", "sum", "var", "prod", "sem"] +) +def test_median_skew_std_sum_var_prod_sem_1953(method): # See 
#1953 for details data = [3, 3, 3, 3, 3, 3, 3, 3, 3] arrays = [ @@ -2736,11 +2738,23 @@ def test_searchsorted( assert case -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_sem(data): - modin_series, _ = create_test_series(data) # noqa: F841 - with pytest.warns(UserWarning): - modin_series.sem() +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_float_nan_only(skipna, ddof): + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda df: df.sem(skipna=skipna, ddof=ddof), + ) + + +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_int_only(ddof): + eval_general( + *create_test_series(test_data["int_data"]), + lambda df: df.sem(ddof=ddof), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)