diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index dd12c92a15a..810cdd51df5 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import cudf @@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index): ): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(col.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interpretable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if col.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(col.dtype)) # Indexes if index is not False: @@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index): elif is_list_dtype(idx): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(idx.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interpretable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. 
+ if idx.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + index_levels.append(idx) col_names.append(name) index_descriptors.append(descr) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8f18d83eb31..de278db919d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2089,10 +2089,7 @@ def as_column( data ) np_type = np.dtype(dtype).type - if np_type == np.bool_: - pa_type = pa.bool_() - else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) + pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index d98ab0504cc..877cec24afa 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect): def test_dtype_raise(in_dtype): with pytest.raises(TypeError): cudf.dtype(in_dtype) + + +def test_dtype_np_bool_to_pa_bool(): + """This test case captures that utility np_to_pa_dtype + should map np.bool_ to pa.bool_, nuances on bit width + difference should be handled elsewhere. + """ + + assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_() diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 5100f1a9c49..bdaf5e144a5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,11 @@ from cudf.core._compat import PANDAS_GE_120 _NA_REP = "" + +"""Map numpy dtype to pyarrow types. +Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special +handling is required when converting a Boolean column into arrow. 
+""" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), @@ -22,7 +27,7 @@ np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), - np.bool_: pa.int8(), + np.bool_: pa.bool_(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(),