diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 24ec56af875..89014b197b8 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -171,8 +171,9 @@ def _numeric_only_reduce_fn(applier: Type[Function], *funcs) -> Callable: """ def caller(self, *args, **kwargs): - # If `numeric_only` is None then we don't know what columns/indices will - # be dropped at the result of reduction function, and so can't preserve labels + # If `numeric_only` is None and the frame contains non-numeric columns, + # then we don't know what columns/indices will be dropped at the result + # of reduction function, and so can't preserve labels preserve_index = kwargs.get("numeric_only", None) is not None return applier.register(*funcs, preserve_index=preserve_index)( self, *args, **kwargs diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index c1b609bf3d5..422327e2838 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -330,12 +330,14 @@ def _validate_axis_equality(self, axis: int, force: bool = False): def _validate_internal_indices(self, mode=None, **kwargs): """ Validates and optionally updates internal and external indices - of modin_frame in specified mode. There is 3 modes supported: - 1. "reduced" - force validates on that axes - where external indices is ["__reduced__"] - 2. "all" - validates indices at all axes, optionally force + of modin_frame in specified mode. There is 4 modes supported: + 1. "reduced" - validates on that axis where external + indices is ["__reduced__"] for not force + 2. "reduced+other" - validates on axis where external + indices is ["__reduced__"] for not force, and force on another axis + 3. "all" - validates indices at all axes, optionally force if `force` parameter specified in kwargs - 3. "custom" - validation follows arguments specified in kwargs. + 4. "custom" - validation follows arguments specified in kwargs. Parameters ---------- @@ -353,26 +355,38 @@ def _validate_internal_indices(self, mode=None, **kwargs): is_force = kwargs.get("force", False) reduced_sample = pandas.Index(["__reduced__"]) + + is_axis_reduced = [self.axes[i].equals(reduced_sample) for i in [0, 1]] + args_dict = { "custom": kwargs, "reduced": { - "validate_index": self.index.equals(reduced_sample), - "validate_columns": self.columns.equals(reduced_sample), - "force": False, + "validate_index": is_axis_reduced[0], + "validate_columns": is_axis_reduced[1], + "force": [False, False], + }, + "reduced+other": { + "validate_index": True, + "validate_columns": True, + "force": [not is_axis_reduced[0], not is_axis_reduced[1]], }, "all": { "validate_index": True, "validate_columns": True, - "force": is_force, + "force": [is_force, is_force], }, } args = args_dict.get(mode, args_dict["custom"]) + def force_getter(axis): + force = args.get("force", False) + return force[axis] if isinstance(force, list) else force + if args.get("validate_index", True): - self._validate_axis_equality(axis=0, force=args.get("force")) + self._validate_axis_equality(axis=0, force=force_getter(0)) if args.get("validate_columns", True): - self._validate_axis_equality(axis=1, force=args.get("force")) + self._validate_axis_equality(axis=1, force=force_getter(1)) def _apply_index_objs(self, axis=None): """Lazily applies the index object (Index or Columns) to the partitions. @@ -1026,18 +1040,12 @@ def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True): new_axes, new_axes_lengths = [0, 0], [0, 0] new_axes[axis] = ["__reduced__"] - new_axes[axis ^ 1] = ( - self.axes[axis ^ 1] - if preserve_index - else self._compute_axis_labels(axis ^ 1, new_parts) - ) + new_axes[axis ^ 1] = self.axes[axis ^ 1] new_axes_lengths[axis] = [1] - new_axes_lengths[axis ^ 1] = ( - self._axes_lengths[axis ^ 1] if preserve_index else None - ) + new_axes_lengths[axis ^ 1] = self._axes_lengths[axis ^ 1] - if (axis == 0 or self._dtypes is None) and preserve_index: + if axis == 0 or self._dtypes is None: new_dtypes = self._dtypes elif preserve_index: new_dtypes = pandas.Series( @@ -1051,7 +1059,7 @@ def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True): *new_axes, *new_axes_lengths, new_dtypes, - validate_axes="reduced", + validate_axes=("reduced" if preserve_index else "reduced+other"), ) def _fold_reduce(self, axis, func, preserve_index=True): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 9b4b8a4f28a..3fad9d11837 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1449,8 +1449,18 @@ def _stat_operation( """ axis = self._get_axis_number(axis) - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) + # If `numeric_only` is None, then we can do this precheck to whether or not + # frame contains non-numeric columns, if it doesn't, then we can pass to a backend + # `numeric_only=False` parameter and make its work easier in that case, rather than + # performing under complicate `numeric_only=None` parameter + if not numeric_only: + try: + self._validate_dtypes(numeric_only=True) + except TypeError: + if numeric_only is not None: + raise + else: + numeric_only = False data = self._get_numeric_data(axis) if numeric_only else self diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py index 1acdd3e97c5..83ada627687 100644 --- a/modin/pandas/test/dataframe/test_reduction.py +++ b/modin/pandas/test/dataframe/test_reduction.py @@ -358,11 +358,14 @@ def test_sum_single_column(data): @pytest.mark.parametrize( "fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"] ) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) ) -def test_reduction_specific(fn, numeric_only): +def test_reduction_specific(fn, numeric_only, axis): + if fn == "mean" and axis == 1: + pytest.skip("See issue #2313 for details") eval_general( *create_test_dfs(test_data_diff_dtype), - lambda df: getattr(df, fn)(numeric_only=numeric_only), + lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis), )