From dfef0b09e3d10f44ded37b1aebdd972ba8beab60 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 5 Aug 2021 14:49:38 -0400 Subject: [PATCH 1/8] cudf.NA fix --- python/cudf/cudf/core/_internals/where.py | 17 +++++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 87dc1d8e01f..a6ce315b425 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -27,7 +27,9 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar(other, dtype=col.dtype if other is None else None) + return cudf.Scalar( + other, dtype=col.dtype if other in {None, cudf.NA} else None + ) def _check_and_cast_columns_with_other( @@ -234,9 +236,16 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame( - cond, columns=frame._column_names, index=frame.index - ) + if isinstance(cond, DataFrame): + cond = DataFrame( + cond, columns=frame._column_names, index=frame.index + ) + else: + cond = DataFrame( + {name: cond for name in frame.columns}, + columns=frame._column_names, + index=frame.index, + ) elif ( hasattr(cond, "__array_interface__") and cond.__array_interface__["shape"] != frame.shape diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 21683d4bdd0..f0e1761d2fe 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8736,3 +8736,19 @@ def test_frame_series_where(): expected = gdf.where(gdf.notna(), gdf.mean()) actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_frame_series_where_other(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = gdf.where(gdf["b"] == 1, cudf.NA) + actual = pdf.where(pdf["b"] == 1, pd.NA) + assert_eq(expected, actual) + + expected = gdf.where(gdf["b"] == 1, 0) + actual = pdf.where(pdf["b"] == 1, 0) + assert_eq(expected, actual) From 7ea271e96e404564bf45756475c74bb28706a034 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 5 Aug 2021 17:14:47 -0400 Subject: [PATCH 2/8] frame._column_names + test update --- python/cudf/cudf/core/_internals/where.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index a6ce315b425..f3212ba4ce2 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -242,8 +242,7 @@ def where( ) else: cond = DataFrame( - {name: cond for name in frame.columns}, - columns=frame._column_names, + {name: cond for name in frame._column_names}, index=frame.index, ) elif ( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f0e1761d2fe..c052a37ee30 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8747,7 +8747,8 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, cudf.NA) actual = pdf.where(pdf["b"] == 1, pd.NA) - assert_eq(expected, actual) + for col in actual.columns: # dtypes will be different + assert_eq(actual[col].fillna(-1).values, expected[col].fillna(-1).values) expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) From 90217944ef7cfbb91cfbf357e0691e9f9d5a5726 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 5 Aug 2021 18:05:16 -0400 Subject: [PATCH 3/8] style update --- python/cudf/cudf/tests/test_dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c052a37ee30..85dea7dc28d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8748,7 +8748,9 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, cudf.NA) actual = pdf.where(pdf["b"] == 1, pd.NA) for col in actual.columns: # dtypes will be different - assert_eq(actual[col].fillna(-1).values, expected[col].fillna(-1).values) + assert_eq( + actual[col].fillna(-1).values, expected[col].fillna(-1).values + ) expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) From ad075d4419943b1ef26a752c0f594737a1d546ae Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Tue, 10 Aug 2021 09:25:32 -0400 Subject: [PATCH 4/8] update so test_df_sr_mask_where passes again --- python/cudf/cudf/core/_internals/where.py | 10 +++++++++- python/cudf/cudf/tests/test_dataframe.py | 9 +++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f3212ba4ce2..cdd03e5274e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -3,8 +3,10 @@ import warnings from typing import Any, Optional, Tuple, Union, cast +import cupy import numpy as np import pandas as pd +from numba import cuda import cudf from cudf._typing import ColumnLike, ScalarLike @@ -236,7 +238,13 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - if isinstance(cond, DataFrame): + if ( + isinstance(cond, DataFrame) + or isinstance(cond, cupy._core.core.ndarray) + or isinstance( + cond, cuda.cudadrv.devicearray.DeviceNDArray + ) + ): cond = DataFrame( cond, columns=frame._column_names, index=frame.index ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 85dea7dc28d..8ced8d2ab82 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8747,10 +8747,11 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, cudf.NA) actual = pdf.where(pdf["b"] == 1, pd.NA) - for col in actual.columns: # dtypes will be different - assert_eq( - actual[col].fillna(-1).values, expected[col].fillna(-1).values - ) + assert_eq( + actual.fillna(-1).values, + expected.fillna(-1).values, + check_dtype=False, + ) expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) From f53327f5b6f6f4bc76c8000b16f55393ed3b0e92 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Tue, 10 Aug 2021 09:44:45 -0400 Subject: [PATCH 5/8] style update --- python/cudf/cudf/core/_internals/where.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index cdd03e5274e..f123c022ce4 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -241,9 +241,7 @@ def where( if ( isinstance(cond, DataFrame) or isinstance(cond, cupy._core.core.ndarray) - or isinstance( - cond, cuda.cudadrv.devicearray.DeviceNDArray - ) + or isinstance(cond, cuda.cudadrv.devicearray.DeviceNDArray) ): cond = DataFrame( cond, columns=frame._column_names, index=frame.index From 65a4fc1ff557564bb5cc369c91a2bf47e788ab81 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Wed, 11 Aug 2021 11:09:14 -0400 Subject: [PATCH 6/8] fix failing test --- python/cudf/cudf/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 46bd1b449c4..829a1545365 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -581,6 +581,8 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ + if from_dtype in {None, cudf.NA}: + return True if isinstance(from_dtype, type): from_dtype = np.dtype(from_dtype) if isinstance(to_dtype, type): From 22a27a8eb9f353ca649bc5b60ac4f864927344e5 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Wed, 11 Aug 2021 12:45:26 -0400 Subject: [PATCH 7/8] swap conditionals --- python/cudf/cudf/core/_internals/where.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f123c022ce4..cab75b72b03 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -238,18 +238,14 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - if ( - isinstance(cond, DataFrame) - or isinstance(cond, cupy._core.core.ndarray) - or isinstance(cond, cuda.cudadrv.devicearray.DeviceNDArray) - ): + if isinstance(cond, Series): cond = DataFrame( - cond, columns=frame._column_names, index=frame.index + {name: cond for name in frame._column_names}, + index=frame.index, ) else: cond = DataFrame( - {name: cond for name in frame._column_names}, - index=frame.index, + cond, columns=frame._column_names, index=frame.index ) elif ( hasattr(cond, "__array_interface__") From fad2856e8cd6670dfa7a676bd49f524da88044b7 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Wed, 11 Aug 2021 12:55:21 -0400 Subject: [PATCH 8/8] remove unused imports --- python/cudf/cudf/core/_internals/where.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index cab75b72b03..4da7cd6bbd7 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -3,10 +3,8 @@ import warnings from typing import Any, Optional, Tuple, Union, cast -import cupy import numpy as np import pandas as pd -from numba import cuda import cudf from cudf._typing import ColumnLike, ScalarLike