Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Switch
nan_as_null
to False
by default in pandas compatibility mo…
…de (#137) This PR resolves an issue in `from_pandas` API where `nan`'s present in a pandas series are being converted to `<NA>` values in `cudf` resulting in incorrect column representation: ```python In [1]: import pandas as pd In [2]: import numpy as np In [3]: import cudf In [4]: s = pd.Series([np.nan, "a", "b"]) In [5]: s Out[5]: 0 NaN 1 a 2 b dtype: object In [6]: cudf.set_option("mode.pandas_compatible", True) In [7]: gs = cudf.from_pandas(s) In [8]: gs Out[8]: 0 <NA> 1 a 2 b dtype: object In [9]: gs = cudf.from_pandas(s, nan_as_null=False) --------------------------------------------------------------------------- ArrowInvalid Traceback (most recent call last) Cell In[9], line 1 ----> 1 gs = cudf.from_pandas(s, nan_as_null=False) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs) 112 @wraps(func) 113 def inner(*args, **kwargs): 114 libnvtx_push_range(self.attributes, self.domain.handle) --> 115 result = func(*args, **kwargs) 116 libnvtx_pop_range(self.domain.handle) 117 return result File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/dataframe.py:7874, in from_pandas(obj, nan_as_null) 7872 return DataFrame.from_pandas(obj, nan_as_null=nan_as_null) 7873 elif isinstance(obj, pd.Series): -> 7874 return Series.from_pandas(obj, nan_as_null=nan_as_null) 7875 elif isinstance(obj, pd.MultiIndex): 7876 return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs) 112 @wraps(func) 113 def inner(*args, **kwargs): 114 libnvtx_push_range(self.attributes, self.domain.handle) --> 115 result = func(*args, **kwargs) 116 libnvtx_pop_range(self.domain.handle) 117 return result File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/series.py:731, in Series.from_pandas(cls, s, nan_as_null) 729 with warnings.catch_warnings(): 730 warnings.simplefilter("ignore") --> 731 result = cls(s, nan_as_null=nan_as_null) 732 return result File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/nvtx/nvtx.py:115, in annotate.__call__.<locals>.inner(*args, **kwargs) 112 @wraps(func) 113 def inner(*args, **kwargs): 114 libnvtx_push_range(self.attributes, self.domain.handle) --> 115 result = func(*args, **kwargs) 116 libnvtx_pop_range(self.domain.handle) 117 return result File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/series.py:651, in Series.__init__(self, data, index, dtype, name, copy, nan_as_null) 633 if not isinstance(data, ColumnBase): 634 # Using `getattr_static` to check if 635 # `data` is on device memory and perform (...) 641 # be expensive or mark a buffer as 642 # unspillable. 643 has_cai = ( 644 type( 645 inspect.getattr_static( (...) 649 is property 650 ) --> 651 data = column.as_column( 652 data, 653 nan_as_null=nan_as_null, 654 dtype=dtype, 655 length=len(index) if index is not None else None, 656 ) 657 if copy and has_cai: 658 data = data.copy(deep=True) File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/cudf/core/column/column.py:2073, in as_column(arbitrary, nan_as_null, dtype, length) 2069 if cudf.get_option( 2070 "mode.pandas_compatible" 2071 ) and _is_pandas_nullable_extension_dtype(arbitrary.dtype): 2072 raise NotImplementedError("not supported") -> 2073 pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) 2074 if arbitrary.dtype == cudf.dtype("object") and cudf.dtype( 2075 pyarrow_array.type.to_pandas_dtype() 2076 ) != cudf.dtype(arbitrary.dtype): 2077 raise MixedTypeError("Cannot create column with mixed types") File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/array.pxi:323, in pyarrow.lib.array() File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/array.pxi:83, in pyarrow.lib._ndarray_to_array() File /nvme/0/pgali/envs/cudfdev/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status() ArrowInvalid: Could not convert 'a' with type str: tried to convert to double In [10]: gs = cudf.from_pandas(s, nan_as_null=True) In [11]: gs.to_pandas() Out[11]: 0 None # In `cudf.pandas` this is problematic because we started with `np.nan` and now ending with `None`. 1 a 2 b dtype: object ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - Matthew Roeschke (https://github.com/mroeschke) URL: rapidsai/cudf-private#137
- Loading branch information