Skip to content

Commit

Permalink
Explicit about bitwidth difference between cudf boolean and arrow boo…
Browse files Browse the repository at this point in the history
…lean (#9192)

Currently, we map the boolean type to `pa.int8` because the bit width of a cudf boolean differs from that of an arrow boolean. However, the implication of this mapping is subtle and may cause unwanted results, such as:

```python
>>> cudf.StructDtype({
    "a": np.bool_,
    "b": np.int8,
})
StructDtype({'a': dtype('int8'), 'b': dtype('int8')})
```

This PR changes the mapping back to `pa.bool_` and uses explicit type handling when converting types to arrow.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - https://github.com/brandon-b-miller
  - H. Thomson Comer (https://github.com/thomcom)

URL: #9192
  • Loading branch information
isVoid authored Sep 22, 2021
1 parent 9da7c01 commit 20713df
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 7 deletions.
20 changes: 18 additions & 2 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import numpy as np
import pyarrow as pa

import cudf
Expand Down Expand Up @@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index):
):
types.append(col.dtype.to_arrow())
else:
types.append(np_to_pa_dtype(col.dtype))
# A boolean element takes 8 bits in cudf and 1 bit in
# pyarrow. To make sure the cudf format is interpretable
# in arrow, we use `int8` type when converting from a
# cudf boolean array.
if col.dtype.type == np.bool_:
types.append(pa.int8())
else:
types.append(np_to_pa_dtype(col.dtype))

# Indexes
if index is not False:
Expand Down Expand Up @@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index):
elif is_list_dtype(idx):
types.append(col.dtype.to_arrow())
else:
types.append(np_to_pa_dtype(idx.dtype))
# A boolean element takes 8 bits in cudf and 1 bit in
# pyarrow. To make sure the cudf format is interpretable
# in arrow, we use `int8` type when converting from a
# cudf boolean array.
if idx.dtype.type == np.bool_:
types.append(pa.int8())
else:
types.append(np_to_pa_dtype(idx.dtype))

index_levels.append(idx)
col_names.append(name)
index_descriptors.append(descr)
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2089,10 +2089,7 @@ def as_column(
data
)
np_type = np.dtype(dtype).type
if np_type == np.bool_:
pa_type = pa.bool_()
else:
pa_type = np_to_pa_dtype(np.dtype(dtype))
pa_type = np_to_pa_dtype(np.dtype(dtype))
data = as_column(
pa.array(
arbitrary,
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect):
def test_dtype_raise(in_dtype):
with pytest.raises(TypeError):
cudf.dtype(in_dtype)


def test_dtype_np_bool_to_pa_bool():
    """Check that ``np_to_pa_dtype`` maps ``np.bool_`` to ``pa.bool_``.

    The utility itself is expected to return the plain boolean mapping;
    nuances of the bit-width difference between the two boolean
    representations should be handled elsewhere.
    """
    # Evaluate the conversion first, then the reference type, mirroring
    # the operand order of the comparison.
    converted = np_to_pa_dtype(np.dtype("bool"))
    expected = pa.bool_()

    assert converted == expected
7 changes: 6 additions & 1 deletion python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"

"""Map numpy dtype to pyarrow types.
Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
handling is required when converting a Boolean column into arrow.
"""
_np_pa_dtypes = {
np.float64: pa.float64(),
np.float32: pa.float32(),
Expand All @@ -22,7 +27,7 @@
np.int32: pa.int32(),
np.int16: pa.int16(),
np.int8: pa.int8(),
np.bool_: pa.int8(),
np.bool_: pa.bool_(),
np.uint64: pa.uint64(),
np.uint32: pa.uint32(),
np.uint16: pa.uint16(),
Expand Down

0 comments on commit 20713df

Please sign in to comment.