Skip to content

Commit

Permalink
Raise NotImplementedError for pd.SparseDtype (#13798)
Browse files Browse the repository at this point in the history
Currently cuDF seems to cast `pd.SparseDtype` to its subtype instead of maintaining the sparse data type from pandas. Since `pd.SparseDtype` is not supported in cuDF, it is better to raise an error and tell users to cast directly to the sparse subtype.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #13798
  • Loading branch information
mroeschke authored Aug 3, 2023
1 parent 9c559c9 commit b7994bc
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 0 deletions.
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2010,6 +2010,11 @@ def as_column(
return as_column(arbitrary.array)
elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype):
return as_column(pa.array(arbitrary.array, from_pandas=True))
elif isinstance(arbitrary.dtype, pd.SparseDtype):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
if is_categorical_dtype(arbitrary):
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
Expand Down Expand Up @@ -2214,6 +2219,11 @@ def as_column(
)
if dtype is not None:
data = data.astype(dtype)
elif isinstance(arbitrary, pd.arrays.SparseArray):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
elif isinstance(arbitrary, memoryview):
data = as_column(
np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10245,6 +10245,12 @@ def test_dataframe_init_columns_named_index():
assert_eq(gdf, pdf)


def test_dataframe_from_pandas_sparse():
    """Building a cudf.DataFrame from a sparse pandas frame must raise.

    The input frame holds ``range(2)`` with ``pd.SparseDtype(np.int64, 0)``;
    the test passes only if the constructor raises ``NotImplementedError``.
    """
    sparse_dtype = pd.SparseDtype(np.int64, 0)
    sparse_frame = pd.DataFrame(range(2), dtype=sparse_dtype)
    # Only the exception type is checked here, not the message text.
    with pytest.raises(NotImplementedError):
        cudf.DataFrame(sparse_frame)


def test_dataframe_constructor_unbounded_sequence():
class A:
def __getitem__(self, key):
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2206,6 +2206,12 @@ def test_series_contains(data, index):
assert_eq(False in ps, False in gs)


def test_series_from_pandas_sparse():
    """Building a cudf.Series from a sparse pandas series must raise.

    The input series holds ``range(2)`` with ``pd.SparseDtype(np.int64, 0)``;
    the test passes only if the constructor raises ``NotImplementedError``.
    """
    sparse_dtype = pd.SparseDtype(np.int64, 0)
    sparse_series = pd.Series(range(2), dtype=sparse_dtype)
    # Only the exception type is checked here, not the message text.
    with pytest.raises(NotImplementedError):
        cudf.Series(sparse_series)


def test_series_constructor_unbounded_sequence():
class A:
def __getitem__(self, key):
Expand Down

0 comments on commit b7994bc

Please sign in to comment.