Skip to content

Commit

Permalink
FIX-#7248: Make sure '_validate_dtypes_sum_prod_mean' works correctly…
Browse files Browse the repository at this point in the history
… with datetime types (#7237)

Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev authored May 13, 2024
1 parent ac3cc90 commit 78dd171
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 47 deletions.
63 changes: 22 additions & 41 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1630,7 +1630,7 @@ def prod(
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True)
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.prod_min_count(
Expand Down Expand Up @@ -2155,9 +2155,19 @@ def sum(
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(
axis, numeric_only, ignore_axis=False
)
# We cannot add datetime types, so if we are summing a column with
# dtype datetime64 and cannot ignore non-numeric types, we must throw a
# TypeError.
if numeric_only is False and any(
dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
for dtype in self.dtypes
):
raise TypeError(
"'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'"
)

data = self._get_numeric_data(axis) if numeric_only else self

if min_count > 1:
return data._reduce_dimension(
data._query_compiler.sum_min_count(
Expand Down Expand Up @@ -3048,31 +3058,23 @@ def _validate_dtypes_min_max(self, axis, numeric_only) -> DataFrame:
"""
# If our DataFrame has both numeric and non-numeric dtypes then
# comparisons between these types do not make sense and we must raise a
# TypeError. The exception to this rule is when there are datetime and
# timedelta objects, in which case we proceed with the comparison
# without ignoring any non-numeric types. We must check explicitly if
# TypeError. We must check explicitly if
# numeric_only is False because if it is None, it will default to True
# if the operation fails with mixed dtypes.
if (
axis
and numeric_only is False
and np.unique([is_numeric_dtype(dtype) for dtype in self.dtypes]).size == 2
and not all([is_numeric_dtype(dtype) for dtype in self.dtypes])
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot compare Numeric and Non-Numeric Types")
raise TypeError("Cannot compare Numeric and Non-Numeric Types")

return self._get_numeric_data(axis) if numeric_only else self

def _validate_dtypes_sum_prod_mean(
def _validate_dtypes_prod_mean(
self, axis, numeric_only, ignore_axis=False
) -> DataFrame:
"""
Validate data dtype for `sum`, `prod` and `mean` methods.
Validate data dtype for `prod` and `mean` methods.
Parameters
----------
Expand All @@ -3089,38 +3091,17 @@ def _validate_dtypes_sum_prod_mean(
-------
DataFrame
"""
# We cannot add datetime types, so if we are summing a column with
# dtype datetime64 and cannot ignore non-numeric types, we must throw a
# TypeError.
if (
not axis
and numeric_only is False
and any(
dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
for dtype in self.dtypes
)
):
raise TypeError("Cannot add Timestamp Types")

# If our DataFrame has both numeric and non-numeric dtypes then
# operations between these types do not make sense and we must raise a
# TypeError. The exception to this rule is when there are datetime and
# timedelta objects, in which case we proceed with the comparison
# without ignoring any non-numeric types. We must check explicitly if
# TypeError. We must check explicitly if
# numeric_only is False because if it is None, it will default to True
# if the operation fails with mixed dtypes.
if (
(axis or ignore_axis)
and numeric_only is False
and np.unique([is_numeric_dtype(dtype) for dtype in self.dtypes]).size == 2
and not all([is_numeric_dtype(dtype) for dtype in self.dtypes])
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
raise TypeError("Cannot operate on Numeric and Non-Numeric Types")

return self._get_numeric_data(axis) if numeric_only else self

Expand Down
10 changes: 4 additions & 6 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1578,7 +1578,7 @@ def prod(
if min_count > len(new_index):
return np.nan

data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=True)
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.prod_min_count(
Expand Down Expand Up @@ -1976,9 +1976,7 @@ def sum(
if min_count > len(new_index):
return np.nan

data = self._validate_dtypes_sum_prod_mean(
axis, numeric_only, ignore_axis=False
)
data = self._validate_dtypes_prod_mean(axis, numeric_only, ignore_axis=False)
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.sum_min_count(
Expand Down Expand Up @@ -2410,11 +2408,11 @@ def _reduce_dimension(self, query_compiler) -> Series | Scalar:
"""
return query_compiler.to_pandas().squeeze()

def _validate_dtypes_sum_prod_mean(
def _validate_dtypes_prod_mean(
self, axis, numeric_only, ignore_axis=False
) -> Series:
"""
Validate data dtype for `sum`, `prod` and `mean` methods.
Validate data dtype for `prod` and `mean` methods.
Parameters
----------
Expand Down
32 changes: 32 additions & 0 deletions modin/tests/pandas/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,38 @@ def test_sum_single_column(data):
df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1))


def test_sum_datetime64():
    """
    Run ``df.sum()`` on frames that contain a datetime64 column.

    The frames hold a daily date range in column "A" and the integers 1..8 in
    column "B". The ``TypeError`` built below is handed to ``eval_general`` as
    ``expected_exception``.
    """
    frame_data = {
        "A": pandas.date_range(start="1/1/2018", end="1/08/2018"),
        "B": list(range(1, 9)),
    }
    modin_df, pandas_df = create_test_dfs(frame_data)

    # Summation is not defined for datetime64 values; this is the error that
    # the reduction is expected to produce.
    sum_error = TypeError(
        "'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'"
    )
    eval_general(
        modin_df,
        pandas_df,
        lambda df: df.sum(),
        expected_exception=sum_error,
    )


def test_min_datetime64():
    """
    Run ``df.min()`` with the default axis and with ``axis=1`` on mixed frames.

    The frames hold a daily date range in column "A" and the integers 1..8 in
    column "B"; both calls are checked through ``eval_general``.
    """
    dates = pandas.date_range(start="1/1/2018", end="1/08/2018")
    modin_df, pandas_df = create_test_dfs({"A": dates, "B": list(range(1, 9))})

    # Default axis.
    eval_general(modin_df, pandas_df, lambda df: df.min())

    # Row-wise reduction mixes timestamps with integers.
    # pandas raises: `TypeError: '<=' not supported between instances of 'Timestamp' and 'int'`
    # whereas modin raises the more general:
    # `TypeError("Cannot compare Numeric and Non-Numeric Types")`.
    # NOTE(review): `expected_exception=False` presumably relaxes the exact
    # exception comparison in `eval_general` -- confirm against that helper.
    eval_general(
        modin_df,
        pandas_df,
        lambda df: df.min(axis=1),
        expected_exception=False,
    )


@pytest.mark.parametrize(
"fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"]
)
Expand Down

0 comments on commit 78dd171

Please sign in to comment.