FIX-#7248: Make sure '_validate_dtypes_sum_prod_mean' works correctly with datetime types (#7237)

Signed-off-by: Anatoly Myachev <[email protected]>
modin-project · May 13, 2024 · 78dd171 · 78dd171
1 parent ac3cc90
commit 78dd171
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 47 deletions.
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -1630,7 +1630,7 @@ def prod(
                 dtype=pandas.api.types.pandas_dtype("object"),
             )
 
-        data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
+        data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True)
         if min_count > 1:
             return data._reduce_dimension(
                 data._query_compiler.prod_min_count(
@@ -2155,9 +2155,19 @@ def sum(
                 dtype=pandas.api.types.pandas_dtype("object"),
             )
 
-        data = self._validate_dtypes_sum_prod_mean(
-            axis, numeric_only, ignore_axis=False
-        )
+        # We cannot add datetime types, so if we are summing a column with
+        # dtype datetime64 and cannot ignore non-numeric types, we must throw a
+        # TypeError.
+        if numeric_only is False and any(
+            dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
+            for dtype in self.dtypes
+        ):
+            raise TypeError(
+                "'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'"
+            )
+
+        data = self._get_numeric_data(axis) if numeric_only else self
+
         if min_count > 1:
             return data._reduce_dimension(
                 data._query_compiler.sum_min_count(
@@ -3048,31 +3058,23 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame:
         """
         # If our DataFrame has both numeric and non-numeric dtypes then
         # comparisons between these types do not make sense and we must raise a
-        # TypeError. The exception to this rule is when there are datetime and
-        # timedelta objects, in which case we proceed with the comparison
-        # without ignoring any non-numeric types. We must check explicitly if
+        # TypeError. We must check explicitly if
         # numeric_only is False because if it is None, it will default to True
         # if the operation fails with mixed dtypes.
         if (
             axis
             and numeric_only is False
-            and np.unique([is_numeric_dtype(dtype) for dtype in self.dtypes]).size == 2
+            and not all([is_numeric_dtype(dtype) for dtype in self.dtypes])
         ):
-            # check if there are columns with dtypes datetime or timedelta
-            if all(
-                dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
-                and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
-                for dtype in self.dtypes
-            ):
-                raise TypeError("Cannot compare Numeric and Non-Numeric Types")
+            raise TypeError("Cannot compare Numeric and Non-Numeric Types")
 
         return self._get_numeric_data(axis) if numeric_only else self
 
-    def _validate_dtypes_sum_prod_mean(
+    def _validate_dtypes_prod_mean(
         self, axis, numeric_only, ignore_axis=False
     ) -> DataFrame:
         """
-        Validate data dtype for `sum`, `prod` and `mean` methods.
+        Validate data dtype for `prod` and `mean` methods.
 
         Parameters
         ----------
@@ -3089,38 +3091,17 @@ def _validate_dtypes_sum_prod_mean(
         -------
         DataFrame
         """
-        # We cannot add datetime types, so if we are summing a column with
-        # dtype datetime64 and cannot ignore non-numeric types, we must throw a
-        # TypeError.
-        if (
-            not axis
-            and numeric_only is False
-            and any(
-                dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
-                for dtype in self.dtypes
-            )
-        ):
-            raise TypeError("Cannot add Timestamp Types")
-
         # If our DataFrame has both numeric and non-numeric dtypes then
         # operations between these types do not make sense and we must raise a
-        # TypeError. The exception to this rule is when there are datetime and
-        # timedelta objects, in which case we proceed with the comparison
-        # without ignoring any non-numeric types. We must check explicitly if
+        # TypeError. We must check explicitly if
         # numeric_only is False because if it is None, it will default to True
         # if the operation fails with mixed dtypes.
         if (
             (axis or ignore_axis)
             and numeric_only is False
-            and np.unique([is_numeric_dtype(dtype) for dtype in self.dtypes]).size == 2
+            and not all([is_numeric_dtype(dtype) for dtype in self.dtypes])
         ):
-            # check if there are columns with dtypes datetime or timedelta
-            if all(
-                dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
-                and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
-                for dtype in self.dtypes
-            ):
-                raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
+            raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
 
         return self._get_numeric_data(axis) if numeric_only else self
 

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
@@ -1578,7 +1578,7 @@ def prod(
         if min_count > len(new_index):
             return np.nan
 
-        data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
+        data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True)
         if min_count > 1:
             return data._reduce_dimension(
                 data._query_compiler.prod_min_count(
@@ -1976,9 +1976,7 @@ def sum(
         if min_count > len(new_index):
             return np.nan
 
-        data = self._validate_dtypes_sum_prod_mean(
-            axis, numeric_only, ignore_axis=False
-        )
+        data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=False)
         if min_count > 1:
             return data._reduce_dimension(
                 data._query_compiler.sum_min_count(
@@ -2410,11 +2408,11 @@ def _reduce_dimension(self, query_compiler) -> Series | Scalar:
         """
         return query_compiler.to_pandas().squeeze()
 
-    def _validate_dtypes_sum_prod_mean(
+    def _validate_dtypes_prod_mean(
         self, axis, numeric_only, ignore_axis=False
     ) -> Series:
         """
-        Validate data dtype for `sum`, `prod` and `mean` methods.
+        Validate data dtype for `prod` and `mean` methods.
 
         Parameters
         ----------

diff --git a/modin/tests/pandas/dataframe/test_reduce.py b/modin/tests/pandas/dataframe/test_reduce.py
@@ -363,6 +363,38 @@ def test_sum_single_column(data):
     df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1))
 
 
+def test_sum_datetime64():
+    pd_ser = pandas.date_range(start="1/1/2018", end="1/08/2018")
+    modin_df, pandas_df = create_test_dfs({"A": pd_ser, "B": [1, 2, 3, 4, 5, 6, 7, 8]})
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.sum(),
+        expected_exception=TypeError(
+            "'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'"
+        ),
+    )
+
+
+def test_min_datetime64():
+    pd_ser = pandas.date_range(start="1/1/2018", end="1/08/2018")
+    modin_df, pandas_df = create_test_dfs({"A": pd_ser, "B": [1, 2, 3, 4, 5, 6, 7, 8]})
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.min(),
+    )
+
+    eval_general(
+        modin_df,
+        pandas_df,
+        lambda df: df.min(axis=1),
+        # pandas raises `TypeError: '<=' not supported between instances of 'Timestamp' and 'int'`,
+        # while modin raises the more general `TypeError("Cannot compare Numeric and Non-Numeric Types")`
+        expected_exception=False,
+    )
+
+
 @pytest.mark.parametrize(
     "fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"]
 )