From 46a61c310de2cd154c999217afc1f6b381dca9df Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Dec 2023 12:19:47 -0800 Subject: [PATCH 1/7] Fix nan_as_null not being respected when passing cudf object --- python/cudf/cudf/core/column/column.py | 22 ++++++++++------------ python/cudf/cudf/tests/test_series.py | 8 ++++++++ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..163f2c3a659 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1939,20 +1939,18 @@ def as_column( * pyarrow array * pandas.Categorical objects """ - if isinstance(arbitrary, ColumnBase): - if dtype is not None: - return arbitrary.astype(dtype) + if isinstance(arbitrary, (ColumnBase, cudf.Series, cudf.BaseIndex)): + if isinstance(arbitrary, cudf.Series): + column = arbitrary._column + elif isinstance(arbitrary, cudf.BaseIndex): + column = arbitrary._values else: - return arbitrary - - elif isinstance(arbitrary, cudf.Series): - data = arbitrary._column + column = arbitrary + if column.dtype.kind == "f" and (nan_as_null is None or nan_as_null): + column = column.nans_to_nulls() if dtype is not None: - data = data.astype(dtype) - elif isinstance(arbitrary, cudf.BaseIndex): - data = arbitrary._values - if dtype is not None: - data = data.astype(dtype) + column = column.astype(dtype) + return column elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 39da34fa89c..473e13e3e65 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2606,6 +2606,14 @@ def test_series_error_nan_non_float_dtypes(): s[0] = np.nan +@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) +def test_nan_as_null_from_cudf_objects(klass): + data = klass(pa.array([float("nan")])) + result = klass(data, nan_as_null=True) + expected = klass(pa.array([None], type=pa.float64())) + assert_eq(result, expected) + + @pytest.mark.parametrize( "dtype", [ From 45bb9e390229830ef21a080f5e7ecd134c18c31e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:18:29 -0800 Subject: [PATCH 2/7] add year --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 163f2c3a659..7fe52ade90a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations From b569b62cba16ee9f328a14dd6f331c5f2191f917 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jan 2024 13:08:07 -0800 Subject: [PATCH 3/7] test concat with nan_as_null=False --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fcb4e77f6a5..492854c9c2f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1704,7 +1704,7 @@ def _concat(cls, objs, axis=0, index=True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls(data=col, index=index, name=name) + return cls(data=col, index=index, name=name, nan_as_null=False) @property # type: ignore @_cudf_nvtx_annotate From 8855f408b34283f899fb94a65f1435257f5d3900 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:47:22 -0800 Subject: [PATCH 4/7] Adjust some tests --- python/cudf/cudf/core/reshape.py | 4 ++-- python/cudf/cudf/tests/test_dlpack.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3cbe58ed39c..a7b2027f9f0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import itertools import warnings @@ -770,7 +770,7 @@ def get_dummies( result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=df._index) else: - ser = cudf.Series(df) + ser = cudf.Series(df, nan_as_null=False) unique = _get_unique(column=ser._column, dummy_na=dummy_na) data = _one_hot_encode_column( column=ser._column, diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 6e34817c4fd..0b3d03d606a 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import itertools from contextlib import ExitStack as does_not_raise @@ -63,7 +63,7 @@ def data_2d(request): def test_to_dlpack_dataframe(data_2d): - expectation = data_size_expectation_builder(data_2d) + expectation = data_size_expectation_builder(data_2d, nan_null_param=True) with expectation: gdf = cudf.DataFrame.from_records(data_2d) @@ -122,7 +122,7 @@ def test_to_dlpack_cupy_1d(data_1d): def test_to_dlpack_cupy_2d(data_2d): - expectation = data_size_expectation_builder(data_2d) + expectation = data_size_expectation_builder(data_2d, nan_null_param=True) with expectation: gdf = cudf.DataFrame.from_records(data_2d) From 79c318f4f35bf966b60575d6a225890edc421eb6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 17:34:12 -0800 Subject: [PATCH 5/7] Make as_column nan_as_null=False by default --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cff32396cdd..106e1fe2f35 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1884,7 +1884,7 @@ def as_memoryview(arbitrary: Any) -> Optional[memoryview]: def as_column( arbitrary: Any, - nan_as_null: Optional[bool] = None, + nan_as_null: Optional[bool] = False, dtype: Optional[Dtype] = None, length: Optional[int] = None, ): From 4c7e3a52de60759a36c78efa202429dc68e1793f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 17:41:34 -0800 Subject: [PATCH 6/7] Use _from_data --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bb2e77fc317..aa4905b9fb2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1702,7 +1702,7 @@ def _concat(cls, objs, axis=0, index=True): if len(objs): col = col._with_type_metadata(objs[0].dtype) - return cls(data=col, index=index, name=name, nan_as_null=False) + return cls._from_data({name: col}, index=index) @property # type: ignore @_cudf_nvtx_annotate From 302df525d1deb75a2f978daf85ed334ea146f3e0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jan 2024 10:26:14 -0800 Subject: [PATCH 7/7] Revert test_dlpack --- python/cudf/cudf/tests/test_dlpack.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 0b3d03d606a..1a2890f09a1 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import itertools from contextlib import ExitStack as does_not_raise @@ -63,7 +63,7 @@ def data_2d(request): def test_to_dlpack_dataframe(data_2d): - expectation = data_size_expectation_builder(data_2d, nan_null_param=True) + expectation = data_size_expectation_builder(data_2d) with expectation: gdf = cudf.DataFrame.from_records(data_2d) @@ -122,7 +122,7 @@ def test_to_dlpack_cupy_1d(data_1d): def test_to_dlpack_cupy_2d(data_2d): - expectation = data_size_expectation_builder(data_2d, nan_null_param=True) + expectation = data_size_expectation_builder(data_2d) with expectation: gdf = cudf.DataFrame.from_records(data_2d)