rapidsai · rapids-bot · Mar 8, 2021 · Feb 18, 2021 · Feb 23, 2021 · Feb 23, 2021
@@ -805,7 +805,7 @@ namespace detail {
  * @brief Convenience function to get offset word from a bitmask
  *
  * @see copy_offset_bitmask
- * @see offset_bitmask_and
+ * @see offset_bitmask_binop
  */
 __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source,
                                                     size_type destination_word_index,

@@ -87,6 +87,21 @@ rmm::device_buffer copy_bitmask(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc bitmask_binop(Binop op, std::vector<bitmask_type const*>, std::vector<size_type> const&,
+ * size_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename Binop>
+rmm::device_buffer bitmask_binop(
+  Binop op,
+  std::vector<bitmask_type const *> const &masks,
+  std::vector<size_type> const &begin_bits,
+  size_type mask_size,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
 /**
  * @copydoc bitmask_and(std::vector<bitmask_type const*>, std::vector<size_type> const&, size_type,
  * rmm::mr::device_memory_resource *)
@@ -100,6 +115,18 @@ rmm::device_buffer bitmask_and(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::bitmask_binop
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+template <typename Binop>
+rmm::device_buffer bitmask_binop(
+  Binop op,
+  table_view const &view,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
 /**
  * @copydoc cudf::bitmask_and
  *
@@ -110,6 +137,29 @@ rmm::device_buffer bitmask_and(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Performs a merger of the specified bitmasks using the binary operator
+ *        provided, and writes in place to destination
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param dest_mask Destination to which the AND result is written
+ * @param masks The list of data pointers of the bitmasks to be ANDed
+ * @param begin_bits The bit offsets from which each mask is to be ANDed
+ * @param mask_size The number of bits to be ANDed in each mask
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+template <typename Binop>
+void inplace_bitmask_binop(
+  Binop op,
+  bitmask_type *dest_mask,
+  std::vector<bitmask_type const *> const &masks,
+  std::vector<size_type> const &begin_bits,
+  size_type mask_size,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a bitwise AND of the specified bitmasks,
  *        and writes in place to destination

@@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and(
   table_view const& view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a bitwise OR of the bitmasks of columns of a table
+ *
+ * If any of the columns isn't nullable, it is considered all valid.
+ * If no column in the table is nullable, an empty bitmask is returned.
+ *
+ * @param view The table of columns
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+rmm::device_buffer bitmask_or(
+  table_view const& view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
@@ -316,8 +316,9 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
 }
 
 /**
- * @brief Computes the bitwise AND of an array of bitmasks
+ * @brief Computes the merger of an array of bitmasks using a binary operator
  *
+ * @param op The binary operator used to combine the bitmasks
  * @param destination The bitmask to write result into
  * @param source Array of source mask pointers. All masks must be of same size
  * @param begin_bit Array of offsets into corresponding @p source masks.
@@ -326,20 +327,25 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
  * @param source_size Number of bits in each mask in @p source
  * @param number_of_mask_words The number of words of type bitmask_type to copy
  */
-__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination,
-                                   bitmask_type const *const *__restrict__ source,
-                                   size_type const *__restrict__ begin_bit,
-                                   size_type num_sources,
-                                   size_type source_size,
-                                   size_type number_of_mask_words)
+template <typename Binop>
+__global__ void offset_bitmask_binop(Binop op,
+                                     bitmask_type *__restrict__ destination,
+                                     bitmask_type const *const *__restrict__ source,
+                                     size_type const *__restrict__ begin_bit,
+                                     size_type num_sources,
+                                     size_type source_size,
+                                     size_type number_of_mask_words)
 {
   for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
        destination_word_index < number_of_mask_words;
        destination_word_index += blockDim.x * gridDim.x) {
-    bitmask_type destination_word = ~bitmask_type{0};  // All bits 1
-    for (size_type i = 0; i < num_sources; i++) {
-      destination_word &= detail::get_mask_offset_word(
-        source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size);
+    bitmask_type destination_word = detail::get_mask_offset_word(
+      source[0], destination_word_index, begin_bit[0], begin_bit[0] + source_size);
+    for (size_type i = 1; i < num_sources; i++) {
+      destination_word =
+        op(destination_word,
+           detail::get_mask_offset_word(
+             source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size));
     }
 
     destination[destination_word_index] = destination_word;
@@ -420,13 +426,14 @@ rmm::device_buffer copy_bitmask(column_view const &view,
   return null_mask;
 }
 
-// Inplace Bitwise AND of the masks
-void inplace_bitmask_and(bitmask_type *dest_mask,
-                         std::vector<bitmask_type const *> const &masks,
-                         std::vector<size_type> const &begin_bits,
-                         size_type mask_size,
-                         rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource *mr)
+template <typename Binop>
+void inplace_bitmask_binop(Binop op,
+                           bitmask_type *dest_mask,
+                           std::vector<bitmask_type const *> const &masks,
+                           std::vector<size_type> const &begin_bits,
+                           size_type mask_size,
+                           rmm::cuda_stream_view stream,
+                           rmm::mr::device_memory_resource *mr)
 {
   CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
                "Invalid range.");
@@ -440,7 +447,8 @@ void inplace_bitmask_and(bitmask_type *dest_mask,
   rmm::device_vector<size_type> d_begin_bits(begin_bits);
 
   cudf::detail::grid_1d config(number_of_mask_words, 256);
-  offset_bitmask_and<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+  offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+    op,
     dest_mask,
     d_masks.data().get(),
     d_begin_bits.data().get(),
@@ -451,23 +459,58 @@ void inplace_bitmask_and(bitmask_type *dest_mask,
   CHECK_CUDA(stream.value());
 }
 
-// Bitwise AND of the masks
-rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
-                               std::vector<size_type> const &begin_bits,
-                               size_type mask_size,
-                               rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource *mr)
+// Inplace Bitwise AND of the masks
+void inplace_bitmask_and(bitmask_type *dest_mask,
+                         std::vector<bitmask_type const *> const &masks,
+                         std::vector<size_type> const &begin_bits,
+                         size_type mask_size,
+                         rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource *mr)
+{
+  inplace_bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    dest_mask,
+    masks,
+    begin_bits,
+    mask_size,
+    stream,
+    mr);
+}
+
+template <typename Binop>
+rmm::device_buffer bitmask_binop(Binop op,
+                                 std::vector<bitmask_type const *> const &masks,
+                                 std::vector<size_type> const &begin_bits,
+                                 size_type mask_size,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource *mr)
 {
   rmm::device_buffer dest_mask{};
   auto num_bytes = bitmask_allocation_size_bytes(mask_size);
 
   dest_mask = rmm::device_buffer{num_bytes, stream, mr};
-  inplace_bitmask_and(
-    static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);
+  inplace_bitmask_binop(
+    op, static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);
 
   return dest_mask;
 }
 
+// Bitwise AND of the masks
+rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
+                               std::vector<size_type> const &begin_bits,
+                               size_type mask_size,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource *mr)
+{
+  return bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    masks,
+    begin_bits,
+    mask_size,
+    stream,
+    mr);
+}
+
 cudf::size_type count_set_bits(bitmask_type const *bitmask,
                                size_type start,
                                size_type stop,
@@ -631,10 +674,12 @@ std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
   return ret;
 }
 
-// Returns the bitwise AND of the null masks of all columns in the table view
-rmm::device_buffer bitmask_and(table_view const &view,
-                               rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource *mr)
+// Returns the merged null masks of all columns in the table view
+template <typename Binop>
+rmm::device_buffer bitmask_binop(Binop op,
+                                 table_view const &view,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource *mr)
 {
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
@@ -650,12 +695,34 @@ rmm::device_buffer bitmask_and(table_view const &view,
   }
 
   if (masks.size() > 0) {
-    return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr);
+    return cudf::detail::bitmask_binop(op, masks, offsets, view.num_rows(), stream, mr);
   }
 
   return null_mask;
 }
 
+// Returns the bitwise AND of the null masks of all columns in the table view
+rmm::device_buffer bitmask_and(table_view const &view,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource *mr)
+{
+  return bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    view,
+    stream,
+    mr);
+}
+
+rmm::device_buffer bitmask_or(table_view const &view,
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource *mr)
+{
+  return bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left | right; },
+    view,
+    stream,
+    mr);
+}
 }  // namespace detail
 
 // Count non-zero bits in the specified range
@@ -708,4 +775,9 @@ rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_re
   return detail::bitmask_and(view, rmm::cuda_stream_default, mr);
 }
 
+rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr)
+{
+  return detail::bitmask_or(view, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace cudf
@@ -520,7 +520,7 @@ public final ColumnVector normalizeNANsAndZeros() {
    * @return the new ColumnVector with merged null mask.
    */
   public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... columns) {
-    assert mergeOp == BinaryOp.BITWISE_AND : "Only BITWISE_AND supported right now";
+    assert mergeOp == BinaryOp.BITWISE_AND || mergeOp == BinaryOp.BITWISE_OR : "Only BITWISE_AND and BITWISE_OR supported right now";
     long[] columnViews = new long[columns.length];
     long size = getRowCount();
 

@@ -1305,10 +1305,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
     cudf::table_view *input_table = new cudf::table_view(column_views);
 
     cudf::binary_operator op = static_cast<cudf::binary_operator>(bin_op);
-    if(op == cudf::binary_operator::BITWISE_AND) {
-      copy->set_null_mask(cudf::bitmask_and(*input_table));
+    switch(op) {
+      case cudf::binary_operator::BITWISE_AND:
+        copy->set_null_mask(cudf::bitmask_and(*input_table));
+        break;
+      case cudf::binary_operator::BITWISE_OR:
+        copy->set_null_mask(cudf::bitmask_or(*input_table));
+        break;
+      default:
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Unsupported merge operation", 0);
     }
-
     return reinterpret_cast<jlong>(copy.release());
   }
   CATCH_STD(env, 0);

@@ -571,7 +571,7 @@ void testSpark32BitMurmur3HashMixed() {
   }
 
   @Test
-  void testNullReconfigureNulls() {
+  void testAndNullReconfigureNulls() {
     try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
          ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null);
          ColumnVector intResult = v1.mergeAndSetValidity(BinaryOp.BITWISE_AND, v0);
@@ -585,6 +585,25 @@ void testNullReconfigureNulls() {
     }
   }
 
+  @Test
+  void testOrNullReconfigureNulls() {
+    try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
+         ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null);
+         ColumnVector intResultV0 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0);
+         ColumnVector intResultV0V1 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1);
+         ColumnVector intResultMulti = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v0, v1, v1, v0, v1, v0);
+         ColumnVector v2 = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", "3");
+         ColumnVector stringResult = v2.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1);
+         ColumnVector stringExpected = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", null);
+         ColumnVector noMaskResult = v2.mergeAndSetValidity(BinaryOp.BITWISE_OR)) {
+      assertColumnsAreEqual(v0, intResultV0);
+      assertColumnsAreEqual(v1, intResultV0V1);
+      assertColumnsAreEqual(v1, intResultMulti);
+      assertColumnsAreEqual(stringExpected, stringResult);
+      assertColumnsAreEqual(v2, noMaskResult);
+    }
+  }
+
   @Test
   void isNotNullTestEmptyColumn() {
     try (ColumnVector v = ColumnVector.fromBoxedInts();