From 2294a3e42cadb3433a665d01ac426e049a061264 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 11 Jun 2021 07:22:47 -0500 Subject: [PATCH] Create objects from iterables that contain cudf.NA (#8442) Closes https://github.com/rapidsai/cudf/issues/8287 PyArrow knows how to handle the `pd.NA` singleton and treats it as null when `from_pandas=True` is passed during array construction. However, there is no option to choose which sentinel or value represents null, and the 'detection' of which values are exactly this object is implemented at the C++ level in pyarrow, limiting our options for 'tricking' pyarrow into seeing `cudf.NA` as null. As such, it is probably best that our `NA` be the very same object as the pandas `NA`. This also makes `cudf.NA is pd.NA` return `True`, which is probably what we want as well. Authors: - https://github.com/brandon-b-miller Approvers: - Marlene (https://github.com/marlenezw) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/8442 --- python/cudf/cudf/core/scalar.py | 15 ++++++-------- python/cudf/cudf/tests/test_categorical.py | 16 ++++++++++++++- python/cudf/cudf/tests/test_decimal.py | 11 ++++++++++ python/cudf/cudf/tests/test_list.py | 10 +++++++++ python/cudf/cudf/tests/test_numerical.py | 24 +++++++++++++++++++++- python/cudf/cudf/tests/test_scalar.py | 6 +----- python/cudf/cudf/tests/test_struct.py | 9 ++++++++ 7 files changed, 75 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 523a7ceeaa1..494671d802f 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -3,6 +3,7 @@ import numpy as np import pyarrow as pa +from pandas._libs.missing import NAType as pd_NAType from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar from cudf.core.column.column import ColumnBase @@ -354,15 +355,11 @@ def astype(self, dtype): return Scalar(self.value, 
dtype) -class _NAType(object): - def __init__(self): - pass - - def __repr__(self): - return "" - - def __bool__(self): - raise TypeError("boolean value of cudf.NA is ambiguous") +class _NAType(pd_NAType): + # Pandas NAType enforces a single instance exists at a time + # instantiating this class will yield the existing instance + # of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). + pass NA = _NAType() diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index a117c15f14d..3d099515e28 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.tests.utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal @pytest.fixture @@ -790,3 +790,17 @@ def test_categorical_setitem_with_nan(): [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False ).astype(gs.dtype) assert_eq(gs, expected_series) + + +@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) +@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) +def test_series_construction_with_nulls(input_obj, dtype): + dtype = np.dtype(dtype) + input_obj = [ + dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj + ] + + expect = pd.Series(input_obj, dtype="category") + got = cudf.Series(input_obj, dtype="category").to_pandas() + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 073a8e443c7..92ef8d9513d 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -1,5 +1,6 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
+import decimal from decimal import Decimal import numpy as np @@ -286,3 +287,13 @@ def test_series_setitem_decimal(args): expect = _decimal_series(expect, dtype) data[to] = item assert_eq(data, expect) + + +@pytest.mark.parametrize( + "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]] +) +def test_series_construction_with_nulls(input_obj): + expect = pa.array(input_obj, from_pandas=True) + got = cudf.Series(input_obj).to_arrow() + + assert expect == got diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7edcb08a7c8..7f123a4311c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -350,3 +350,13 @@ def test_list_getitem(indata, expect): list_sr = cudf.Series([indata]) # __getitem__ shall fill None with cudf.NA assert list_sr[0] == expect + + +@pytest.mark.parametrize( + "input_obj", [[[1, cudf.NA, 3]], [[1, cudf.NA, 3], [4, 5, cudf.NA]]] +) +def test_construction_series_with_nulls(input_obj): + expect = pa.array(input_obj, from_pandas=True) + got = cudf.Series(input_obj).to_arrow() + + assert expect == got diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 6d9bcda2c0b..12b17447268 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -6,7 +6,8 @@ import cudf from cudf.core._compat import PANDAS_GE_100 -from cudf.tests.utils import assert_eq +from cudf.tests.utils import NUMERIC_TYPES, assert_eq +from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes def test_can_cast_safely_same_kind(): @@ -384,3 +385,24 @@ def test_to_numeric_error(data, errors): got = cudf.to_numeric(data, errors=errors) assert_eq(expect, got) + + +@pytest.mark.parametrize("dtype", NUMERIC_TYPES) +@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) +def test_series_construction_with_nulls(dtype, input_obj): + dtype = np.dtype(dtype) + # numpy case + + expect = pd.Series(input_obj, 
dtype=cudf_dtypes_to_pandas_dtypes[dtype]) + got = cudf.Series(input_obj, dtype=dtype).to_pandas(nullable=True) + + assert_eq(expect, got) + + # Test numpy array of objects case + np_data = [ + dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj + ] + + expect = pd.Series(np_data, dtype=cudf_dtypes_to_pandas_dtypes[dtype]) + got = cudf.Series(np_data, dtype=dtype).to_pandas(nullable=True) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 42939f8129a..01e6b52f526 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -329,11 +329,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): try: cls(pd.NA) except TypeError as e: - - error = ( - str(e).replace("NAType", "_NAType").replace(" NA ", " cudf.NA ") - ) - with pytest.raises(TypeError, match=re.escape(str(error))): + with pytest.raises(TypeError, match=re.escape(str(e))): slr = pycudf_scalar(None, dtype=dtype) cls(slr) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 3c211951dff..669905c5c92 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import pandas as pd +import pyarrow as pa import pytest import cudf @@ -44,3 +45,11 @@ def test_struct_for_field(key, expect): expect = cudf.Series(expect) got = sr.struct.field(key) assert_eq(expect, got) + + +@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) +def test_series_construction_with_nulls(input_obj): + expect = pa.array(input_obj, from_pandas=True) + got = cudf.Series(input_obj).to_arrow() + + assert expect == got