Raise NotImplementedError for pd.SparseDtype (#13798)

Currently cuDF seems to cast `pd.SparseDtype` to its subtype instead of maintaining the sparse data type from pandas. Since `pd.SparseDtype` is not supported in cuDF, it is better to raise and tell users to cast directly to the sparse subtype. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #13798
rapidsai · Aug 3, 2023 · b7994bc · b7994bc
1 parent 9c559c9
commit b7994bc
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 0 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2010,6 +2010,11 @@ def as_column(
                 return as_column(arbitrary.array)
             elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype):
                 return as_column(pa.array(arbitrary.array, from_pandas=True))
+            elif isinstance(arbitrary.dtype, pd.SparseDtype):
+                raise NotImplementedError(
+                    f"{arbitrary.dtype} is not supported. Convert first to "
+                    f"{arbitrary.dtype.subtype}."
+                )
         if is_categorical_dtype(arbitrary):
             data = as_column(pa.array(arbitrary, from_pandas=True))
         elif is_interval_dtype(arbitrary.dtype):
@@ -2214,6 +2219,11 @@ def as_column(
             )
         if dtype is not None:
             data = data.astype(dtype)
+    elif isinstance(arbitrary, pd.arrays.SparseArray):
+        raise NotImplementedError(
+            f"{arbitrary.dtype} is not supported. Convert first to "
+            f"{arbitrary.dtype.subtype}."
+        )
     elif isinstance(arbitrary, memoryview):
         data = as_column(
             np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -10245,6 +10245,12 @@ def test_dataframe_init_columns_named_index():
     assert_eq(gdf, pdf)
 
 
+def test_dataframe_from_pandas_sparse():
+    pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0))
+    with pytest.raises(NotImplementedError):
+        cudf.DataFrame(pdf)
+
+
 def test_dataframe_constructor_unbounded_sequence():
     class A:
         def __getitem__(self, key):

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -2206,6 +2206,12 @@ def test_series_contains(data, index):
     assert_eq(False in ps, False in gs)
 
 
+def test_series_from_pandas_sparse():
+    pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0))
+    with pytest.raises(NotImplementedError):
+        cudf.Series(pser)
+
+
 def test_series_constructor_unbounded_sequence():
     class A:
         def __getitem__(self, key):