Skip to content

Commit

Permalink
Switch nan_as_null to False by default in pandas compatibility mo…
Browse files Browse the repository at this point in the history
…de (#137)

This PR resolves an issue in `from_pandas` API where `nan`'s present in a pandas series are being converted to `<NA>` values in `cudf` resulting in incorrect column representation:
```python
In [1]: import pandas as pd

In [2]: import numpy as np

In [3]: import cudf

In [4]: s = pd.Series([np.nan, "a", "b"])

In [5]: s
Out[5]: 
0    NaN
1      a
2      b
dtype: object

In [6]: cudf.set_option("mode.pandas_compatible", True)

In [7]: gs = cudf.from_pandas(s)

In [8]: gs
Out[8]: 
0    <NA>
1       a
2       b
dtype: object

In [9]: gs = cudf.from_pandas(s, nan_as_null=False)
---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[9], line 1
----> 1 gs = cudf.from_pandas(s, nan_as_null=False)

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs)
    112 @wraps(func)
    113 def inner(*args, **kwargs):
    114     libnvtx_push_range(self.attributes, self.domain.handle)
--> 115     result = func(*args, **kwargs)
    116     libnvtx_pop_range(self.domain.handle)
    117     return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/dataframe.py:7874, in from_pandas(obj, nan_as_null)
   7872     return DataFrame.from_pandas(obj, nan_as_null=nan_as_null)
   7873 elif isinstance(obj, pd.Series):
-> 7874     return Series.from_pandas(obj, nan_as_null=nan_as_null)
   7875 elif isinstance(obj, pd.MultiIndex):
   7876     return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs)
    112 @wraps(func)
    113 def inner(*args, **kwargs):
    114     libnvtx_push_range(self.attributes, self.domain.handle)
--> 115     result = func(*args, **kwargs)
    116     libnvtx_pop_range(self.domain.handle)
    117     return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/series.py:731, in Series.from_pandas(cls, s, nan_as_null)
    729 with warnings.catch_warnings():
    730     warnings.simplefilter("ignore")
--> 731     result = cls(s, nan_as_null=nan_as_null)
    732 return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs)
    112 @wraps(func)
    113 def inner(*args, **kwargs):
    114     libnvtx_push_range(self.attributes, self.domain.handle)
--> 115     result = func(*args, **kwargs)
    116     libnvtx_pop_range(self.domain.handle)
    117     return result

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/series.py:651, in Series.__init__(self, data, index, dtype, name, copy, nan_as_null)
    633 if not isinstance(data, ColumnBase):
    634     # Using `getattr_static` to check if
    635     # `data` is on device memory and perform
   (...)
    641     # be expensive or mark a buffer as
    642     # unspillable.
    643     has_cai = (
    644         type(
    645             inspect.getattr_static(
   (...)
    649         is property
    650     )
--> 651     data = column.as_column(
    652         data,
    653         nan_as_null=nan_as_null,
    654         dtype=dtype,
    655         length=len(index) if index is not None else None,
    656     )
    657     if copy and has_cai:
    658         data = data.copy(deep=True)

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/column/column.py:2073, in as_column(arbitrary, nan_as_null, dtype, length)
   2069 if cudf.get_option(
   2070     "mode.pandas_compatible"
   2071 ) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
   2072     raise NotImplementedError("not supported")
-> 2073 pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
   2074 if arbitrary.dtype == cudf.dtype("object") and cudf.dtype(
   2075     pyarrow_array.type.to_pandas_dtype()
   2076 ) != cudf.dtype(arbitrary.dtype):
   2077     raise MixedTypeError("Cannot create column with mixed types")

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/array.pxi:323, in pyarrow.lib.array()

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/array.pxi:83, in pyarrow.lib._ndarray_to_array()

File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: Could not convert 'a' with type str: tried to convert to double

In [10]: gs = cudf.from_pandas(s, nan_as_null=True)

In [11]: gs.to_pandas()
Out[11]: 
0    None     # In `cudf.pandas` this is problematic because we started with `np.nan` and now ending with `None`.
1       a
2       b
dtype: object
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: rapidsai/cudf-private#137
  • Loading branch information
galipremsagar authored Oct 31, 2023
1 parent 94bc2f2 commit 658bca4
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 5 deletions.
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,7 @@ def __array_function__(self, func, types, args, kwargs):
return NotImplemented

@classmethod
def from_pandas(cls, index, nan_as_null=None):
def from_pandas(cls, index, nan_as_null=no_default):
"""
Convert from a Pandas Index.
Expand Down Expand Up @@ -1866,6 +1866,11 @@ def from_pandas(cls, index, nan_as_null=None):
>>> cudf.Index.from_pandas(pdi, nan_as_null=False)
Float64Index([10.0, 20.0, 30.0, nan], dtype='float64')
"""
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

if not isinstance(index, pd.Index):
raise TypeError("not a pandas.Index")

Expand Down
14 changes: 12 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5184,7 +5184,7 @@ def to_pandas(self, nullable=False, **kwargs):

@classmethod
@_cudf_nvtx_annotate
def from_pandas(cls, dataframe, nan_as_null=None):
def from_pandas(cls, dataframe, nan_as_null=no_default):
"""
Convert from a Pandas DataFrame.
Expand Down Expand Up @@ -5213,6 +5213,11 @@ def from_pandas(cls, dataframe, nan_as_null=None):
1 1 2
2 3 4
"""
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

if isinstance(dataframe, pd.DataFrame):

if not dataframe.columns.is_unique:
Expand Down Expand Up @@ -7770,7 +7775,7 @@ def func(left, right, output):


@_cudf_nvtx_annotate
def from_pandas(obj, nan_as_null=None):
def from_pandas(obj, nan_as_null=no_default):
"""
Convert certain Pandas objects into the cudf equivalent.
Expand Down Expand Up @@ -7868,6 +7873,11 @@ def from_pandas(obj, nan_as_null=None):
>>> type(pmidx)
<class 'pandas.core.indexes.multi.MultiIndex'>
"""
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

if isinstance(obj, pd.DataFrame):
return DataFrame.from_pandas(obj, nan_as_null=nan_as_null)
elif isinstance(obj, pd.Series):
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1604,7 +1604,7 @@ def to_pandas(self, nullable=False, **kwargs):

@classmethod
@_cudf_nvtx_annotate
def from_pandas(cls, multiindex, nan_as_null=None):
def from_pandas(cls, multiindex, nan_as_null=no_default):
"""
Convert from a Pandas MultiIndex
Expand All @@ -1625,6 +1625,10 @@ def from_pandas(cls, multiindex, nan_as_null=None):
"""
if not isinstance(multiindex, pd.MultiIndex):
raise TypeError("not a pandas.MultiIndex")
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)

# if `multiindex` has two or more levels that
# have the same name, then `multiindex.to_frame()`
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def __contains__(self, item):

@classmethod
@_cudf_nvtx_annotate
def from_pandas(cls, s, nan_as_null=None):
def from_pandas(cls, s, nan_as_null=no_default):
"""
Convert from a Pandas Series.
Expand Down Expand Up @@ -726,6 +726,10 @@ def from_pandas(cls, s, nan_as_null=None):
3 NaN
dtype: float64
"""
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
result = cls(s, nan_as_null=nan_as_null)
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2575,3 +2575,10 @@ def test_series_categorical_missing_value_count():
actual = gs.value_counts()

assert_eq(expected, actual, check_dtype=False)


def test_series_error_nan_mixed_types():
ps = pd.Series([np.nan, "ab", "cd"])
with cudf.option_context("mode.pandas_compatible", True):
with pytest.raises(pa.ArrowInvalid):
cudf.from_pandas(ps)

0 comments on commit 658bca4

Please sign in to comment.