Skip to content

Commit

Permalink
Raise MixedTypeError when a mix of bools and None are stored in…
Browse files Browse the repository at this point in the history
… pandas columns (#3)

Fixes: rapidsai/xdf#322

This PR raises an error when a pandas column containing a mix of bools and None is detected, i.e., when a boolean column has dtype object rather than bool/boolean.
  • Loading branch information
galipremsagar authored Sep 30, 2023
1 parent abb95fe commit 952f2bc
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 15 deletions.
14 changes: 3 additions & 11 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2061,14 +2061,9 @@ def as_column(
)
else:
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
if (
arbitrary.dtype == cudf.dtype("object")
and cudf.dtype(pyarrow_array.type.to_pandas_dtype())
!= cudf.dtype(arbitrary.dtype)
and not is_bool_dtype(
cudf.dtype(pyarrow_array.type.to_pandas_dtype())
)
):
if arbitrary.dtype == cudf.dtype("object") and cudf.dtype(
pyarrow_array.type.to_pandas_dtype()
) != cudf.dtype(arbitrary.dtype):
raise MixedTypeError("Cannot create column with mixed types")
if isinstance(pyarrow_array.type, pa.Decimal128Type):
pyarrow_type = cudf.Decimal128Dtype.from_arrow(
Expand Down Expand Up @@ -2458,9 +2453,6 @@ def as_column(
and (
cudf.dtype(pyarrow_array.type.to_pandas_dtype())
!= cudf.dtype(arbitrary.dtype)
and not is_bool_dtype(
cudf.dtype(pyarrow_array.type.to_pandas_dtype())
)
)
):
raise MixedTypeError(
Expand Down
8 changes: 5 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4029,7 +4029,7 @@ def test_any(data, axis):
dtype = None if data else float
if np.array(data).ndim <= 1:
pdata = pd.Series(data=data, dtype=dtype)
gdata = cudf.Series.from_pandas(pdata)
gdata = cudf.Series(data=data, dtype=dtype)

if axis == 1:
with pytest.raises(NotImplementedError):
Expand Down Expand Up @@ -8949,8 +8949,10 @@ def test_agg_for_dataframe_with_string_columns(aggs):
"c": pd.Series([2, np.nan, 5.0], index=[2, 3, 4]),
},
{
"a": [True, np.nan, True],
"d": pd.Series([False, True, np.nan], index=[0, 1, 3]),
"a": pd.Series([True, None, True], dtype=pd.BooleanDtype()),
"d": pd.Series(
[False, True, None], index=[0, 1, 3], dtype=pd.BooleanDtype()
),
},
],
)
Expand Down
12 changes: 11 additions & 1 deletion python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2205,7 +2205,7 @@ def test_series_mixed_dtype_error(dtype):
@pytest.mark.parametrize("index", [None, [10, 20, 30]])
def test_series_contains(data, index):
ps = pd.Series(data, index=index)
gs = cudf.from_pandas(ps)
gs = cudf.Series(data, index=index)

assert_eq(1 in ps, 1 in gs)
assert_eq(10 in ps, 10 in gs)
Expand Down Expand Up @@ -2326,3 +2326,13 @@ def test_series_count_invalid_param():
s = cudf.Series([], dtype="float64")
with pytest.raises(TypeError):
s.count(skipna=True)


def test_bool_series_mixed_dtype_error():
    """Check that a mixed bool/None pandas Series is rejected by cudf.

    ``pd.Series([True, False, None])`` ends up with ``object`` dtype
    rather than ``bool``/``boolean``; that dtype isn't supported by
    ``cudf``, so both conversion entry points are expected to raise
    ``TypeError``.
    """
    mixed_bools = pd.Series([True, False, None])
    # Exercise the Series constructor first, then from_pandas.
    for to_cudf in (cudf.Series, cudf.from_pandas):
        with pytest.raises(TypeError):
            to_cudf(mixed_bools)

0 comments on commit 952f2bc

Please sign in to comment.