Skip to content

Commit

Permalink
Create objects from iterables that contain cudf.NA (#8442)
Browse files Browse the repository at this point in the history
Closes #8287

PyArrow knows how to handle the `pd.NA` singleton and treats it as null when `from_pandas=True` is passed during array construction. However, there is no option to choose which sentinel or value represents null, and the detection of exactly this object is implemented at the C++ level in pyarrow, which limits our options for 'tricking' pyarrow into seeing `cudf.NA` as null.

As such it is probably best that our `NA` be identically the pandas `NA`. This also makes `cudf.NA is pd.NA` return true, which is probably what we want as well.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - Michael Wang (https://github.com/isVoid)

URL: #8442
  • Loading branch information
brandon-b-miller authored Jun 11, 2021
1 parent 306ae4f commit 2294a3e
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 16 deletions.
15 changes: 6 additions & 9 deletions python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pyarrow as pa
from pandas._libs.missing import NAType as pd_NAType

from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
from cudf.core.column.column import ColumnBase
Expand Down Expand Up @@ -354,15 +355,11 @@ def astype(self, dtype):
return Scalar(self.value, dtype)


class _NAType:
    """Type of the ``NA`` marker: prints as ``<NA>`` and has no truth value."""

    def __init__(self):
        # Stateless marker; there is nothing to set up.
        return None

    def __repr__(self):
        return "<NA>"

    def __bool__(self):
        # Refuse implicit truth testing rather than silently pick a value.
        raise TypeError("boolean value of cudf.NA is ambiguous")
class _NAType(pd_NAType):
    """cudf's missing-value type, a thin subclass of the pandas ``NAType``.

    The pandas ``NAType`` allows only one instance to exist at a time, so
    instantiating this class yields the already-existing instance of
    ``pandas._libs.missing.NAType`` rather than a new object. As a result
    ``id(cudf.NA) == id(pd.NA)``.
    """


NA = _NAType()
16 changes: 15 additions & 1 deletion python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import cudf
from cudf.core._compat import PANDAS_GE_110
from cudf.tests.utils import assert_eq, assert_exceptions_equal
from cudf.tests.utils import NUMERIC_TYPES, assert_eq, assert_exceptions_equal


@pytest.fixture
Expand Down Expand Up @@ -790,3 +790,17 @@ def test_categorical_setitem_with_nan():
[1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False
).astype(gs.dtype)
assert_eq(gs, expected_series)


@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"])
@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]])
def test_series_construction_with_nulls(input_obj, dtype):
    """Build a categorical Series from values that include ``cudf.NA``."""
    scalar_type = np.dtype(dtype).type
    # Cast each real value to the parametrized dtype's scalar type and
    # leave the NA marker untouched.
    values = [
        cudf.NA if item is cudf.NA else scalar_type(item)
        for item in input_obj
    ]

    expect = pd.Series(values, dtype="category")
    got = cudf.Series(values, dtype="category").to_pandas()

    assert_eq(expect, got)
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_decimal.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import decimal
from decimal import Decimal

import numpy as np
Expand Down Expand Up @@ -286,3 +287,13 @@ def test_series_setitem_decimal(args):
expect = _decimal_series(expect, dtype)
data[to] = item
assert_eq(data, expect)


@pytest.mark.parametrize(
    "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]]
)
def test_series_construction_with_nulls(input_obj):
    """Decimals mixed with ``cudf.NA`` convert to Arrow as pyarrow does."""
    # Reference result: pyarrow's own conversion with pandas null handling.
    arrow_reference = pa.array(input_obj, from_pandas=True)
    arrow_from_cudf = cudf.Series(input_obj).to_arrow()

    assert arrow_reference == arrow_from_cudf
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,13 @@ def test_list_getitem(indata, expect):
list_sr = cudf.Series([indata])
# __getitem__ shall fill None with cudf.NA
assert list_sr[0] == expect


@pytest.mark.parametrize(
    "input_obj", [[[1, cudf.NA, 3]], [[1, cudf.NA, 3], [4, 5, cudf.NA]]]
)
def test_construction_series_with_nulls(input_obj):
    """Nested lists holding ``cudf.NA`` convert to Arrow as pyarrow does."""
    # Reference result: pyarrow's own conversion with pandas null handling.
    arrow_reference = pa.array(input_obj, from_pandas=True)
    arrow_from_cudf = cudf.Series(input_obj).to_arrow()

    assert arrow_reference == arrow_from_cudf
24 changes: 23 additions & 1 deletion python/cudf/cudf/tests/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import cudf
from cudf.core._compat import PANDAS_GE_100
from cudf.tests.utils import assert_eq
from cudf.tests.utils import NUMERIC_TYPES, assert_eq
from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes


def test_can_cast_safely_same_kind():
Expand Down Expand Up @@ -384,3 +385,24 @@ def test_to_numeric_error(data, errors):
got = cudf.to_numeric(data, errors=errors)

assert_eq(expect, got)


@pytest.mark.parametrize("dtype", NUMERIC_TYPES)
@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]])
def test_series_construction_with_nulls(dtype, input_obj):
    """Numeric Series built from values with ``cudf.NA`` match pandas."""
    dtype = np.dtype(dtype)
    pandas_dtype = cudf_dtypes_to_pandas_dtypes[dtype]

    def _check(values):
        # Compare the pandas Series against the cudf Series converted
        # back to pandas with nullable dtypes.
        expect = pd.Series(values, dtype=pandas_dtype)
        got = cudf.Series(values, dtype=dtype).to_pandas(nullable=True)
        assert_eq(expect, got)

    # Case 1: the raw parametrized values.
    _check(input_obj)

    # Case 2: the same values cast to NumPy scalars of the target dtype,
    # with the NA marker passed through unchanged.
    typed_values = [
        cudf.NA if item is cudf.NA else dtype.type(item)
        for item in input_obj
    ]
    _check(typed_values)
6 changes: 1 addition & 5 deletions python/cudf/cudf/tests/test_scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype):
try:
cls(pd.NA)
except TypeError as e:

error = (
str(e).replace("NAType", "_NAType").replace(" NA ", " cudf.NA ")
)
with pytest.raises(TypeError, match=re.escape(str(error))):
with pytest.raises(TypeError, match=re.escape(str(e))):
slr = pycudf_scalar(None, dtype=dtype)
cls(slr)

Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import pandas as pd
import pyarrow as pa
import pytest

import cudf
Expand Down Expand Up @@ -44,3 +45,11 @@ def test_struct_for_field(key, expect):
expect = cudf.Series(expect)
got = sr.struct.field(key)
assert_eq(expect, got)


@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]])
def test_series_construction_with_nulls(input_obj):
    """Dicts with a ``cudf.NA`` field convert to Arrow as pyarrow does."""
    # Reference result: pyarrow's own conversion with pandas null handling.
    arrow_reference = pa.array(input_obj, from_pandas=True)
    arrow_from_cudf = cudf.Series(input_obj).to_arrow()

    assert arrow_reference == arrow_from_cudf

0 comments on commit 2294a3e

Please sign in to comment.