Explicit about bitwidth difference between cudf boolean and arrow boo…

…lean (#9192) Currently, we map boolean type to `pa.int8` because the bitwidth of cudf boolean mismatches that in arrow. However the implication of this mapping is subtle and may cause unwanted result such as: ```python >>> cudf.StructDtype({ "a": np.bool_, "b": np.int8, }) StructDtype({'a': dtype('int8'), 'b': dtype('int8')}) ``` This PR changes the mapping back to `pa.bool_`, and use explicit type handling when we are dealing with type conversion to arrow. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - https://github.com/brandon-b-miller - H. Thomson Comer (https://github.com/thomcom) URL: #9192
rapidsai · Sep 22, 2021 · 20713df · 20713df
1 parent 9da7c01
commit 20713df
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 7 deletions.
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
@@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index):
         ):
             types.append(col.dtype.to_arrow())
         else:
-            types.append(np_to_pa_dtype(col.dtype))
+            # A boolean element takes 8 bits in cudf and 1 bit in
+            # pyarrow. To make sure the cudf format is interperable
+            # in arrow, we use `int8` type when converting from a
+            # cudf boolean array.
+            if col.dtype.type == np.bool_:
+                types.append(pa.int8())
+            else:
+                types.append(np_to_pa_dtype(col.dtype))
 
     # Indexes
     if index is not False:
@@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index):
                 elif is_list_dtype(idx):
                     types.append(col.dtype.to_arrow())
                 else:
-                    types.append(np_to_pa_dtype(idx.dtype))
+                    # A boolean element takes 8 bits in cudf and 1 bit in
+                    # pyarrow. To make sure the cudf format is interperable
+                    # in arrow, we use `int8` type when converting from a
+                    # cudf boolean array.
+                    if idx.dtype.type == np.bool_:
+                        types.append(pa.int8())
+                    else:
+                        types.append(np_to_pa_dtype(idx.dtype))
+
                 index_levels.append(idx)
             col_names.append(name)
             index_descriptors.append(descr)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2089,10 +2089,7 @@ def as_column(
                             data
                         )
                     np_type = np.dtype(dtype).type
-                    if np_type == np.bool_:
-                        pa_type = pa.bool_()
-                    else:
-                        pa_type = np_to_pa_dtype(np.dtype(dtype))
+                    pa_type = np_to_pa_dtype(np.dtype(dtype))
                 data = as_column(
                     pa.array(
                         arbitrary,

diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
@@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect):
 def test_dtype_raise(in_dtype):
     with pytest.raises(TypeError):
         cudf.dtype(in_dtype)
+
+
+def test_dtype_np_bool_to_pa_bool():
+    """This test case captures that utility np_to_pa_dtype
+    should map np.bool_ to pa.bool_, nuances on bit width
+    difference should be handled elsewhere.
+    """
+
+    assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_()
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
@@ -14,6 +14,11 @@
 from cudf.core._compat import PANDAS_GE_120
 
 _NA_REP = "<NA>"
+
+"""Map numpy dtype to pyarrow types.
+Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
+handling is required when converting a Boolean column into arrow.
+"""
 _np_pa_dtypes = {
     np.float64: pa.float64(),
     np.float32: pa.float32(),
@@ -22,7 +27,7 @@
     np.int32: pa.int32(),
     np.int16: pa.int16(),
     np.int8: pa.int8(),
-    np.bool_: pa.int8(),
+    np.bool_: pa.bool_(),
     np.uint64: pa.uint64(),
     np.uint32: pa.uint32(),
     np.uint16: pa.uint16(),