rapidsai · rapids-bot · Mar 8, 2021 · Feb 18, 2021 · Feb 23, 2021 · Feb 23, 2021
@@ -805,7 +805,7 @@ namespace detail {
  * @brief Convenience function to get offset word from a bitmask
  *
  * @see copy_offset_bitmask
- * @see offset_bitmask_and
+ * @see offset_bitmask_binop
  */
 __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source,
                                                     size_type destination_word_index,

@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/null_mask.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+using cudf::detail::device_span;
+
+namespace cudf {
+namespace {
+/**
+ * @brief Computes the merger of an array of bitmasks using a binary operator
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param destination The bitmask to write result into
+ * @param source Array of source mask pointers. All masks must be of same size
+ * @param begin_bit Array of offsets into corresponding @p source masks.
+ *                  Must be same size as source array
+ * @param num_sources Number of masks in @p source array
+ * @param source_size Number of bits in each mask in @p source
+ * @param number_of_mask_words The number of words of type bitmask_type to copy
+ */
+template <typename Binop>
+__global__ void offset_bitmask_binop(Binop op,
+                                     device_span<bitmask_type> destination,
+                                     device_span<bitmask_type const *> source,
+                                     device_span<size_type> begin_bit,
+                                     size_type source_size)
+{
+  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
+       destination_word_index < destination.size();
+       destination_word_index += blockDim.x * gridDim.x) {
+    bitmask_type destination_word = detail::get_mask_offset_word(
+      source[0], destination_word_index, begin_bit[0], begin_bit[0] + source_size);
+    for (size_type i = 1; i < source.size(); i++) {
+      destination_word =
+        op(destination_word,
+           detail::get_mask_offset_word(
+             source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size));
+    }
+
+    destination[destination_word_index] = destination_word;
+  }
+}
+}  // namespace
+namespace detail {
+/**
+ * @copydoc bitmask_binop(Binop op, std::vector<bitmask_type const*>, std::vector<size_type> const&,
+ * size_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename Binop>
+rmm::device_buffer bitmask_binop(
+  Binop op,
+  std::vector<bitmask_type const *> const &masks,
+  std::vector<size_type> const &begin_bits,
+  size_type mask_size,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
+               "Invalid range.");
+  CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
+               "Mask pointer cannot be null");
+  rmm::device_buffer dest_mask{};
+  auto num_bytes = bitmask_allocation_size_bytes(mask_size);
+
+  rmm::device_vector<bitmask_type const *> d_masks(masks);
+  rmm::device_vector<size_type> d_begin_bits(begin_bits);
+
+  dest_mask = rmm::device_buffer{num_bytes, stream, mr};
+
+  inplace_bitmask_binop(op,
+                        device_span<bitmask_type>(static_cast<bitmask_type *>(dest_mask.data()),
+                                                  num_bitmask_words(mask_size)),
+                        device_span<bitmask_type const *>(d_masks.data().get(), d_masks.size()),
+                        device_span<size_type>(d_begin_bits.data().get(), d_begin_bits.size()),
+                        mask_size,
+                        stream,
+                        mr);
+
+  return dest_mask;
+}
+
+/**
+ * @brief Performs a merger of the specified bitmasks using the binary operator
+ *        provided, and writes in place to destination
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param dest_mask Destination to which the AND result is written
+ * @param masks The list of data pointers of the bitmasks to be ANDed
+ * @param begin_bits The bit offsets from which each mask is to be ANDed
+ * @param mask_size The number of bits to be ANDed in each mask
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+template <typename Binop>
+void inplace_bitmask_binop(
+  Binop op,
+  device_span<bitmask_type> dest_mask,
+  device_span<bitmask_type const *> masks,
+  device_span<size_type> begin_bits,
+  size_type mask_size,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");
+
+  cudf::detail::grid_1d config(dest_mask.size(), 256);
+  offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+    op, dest_mask, masks, begin_bits, mask_size);
+  CHECK_CUDA(stream.value());
+}
+
+}  // namespace detail
+
+}  // namespace cudf
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 

@@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and(
   table_view const& view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a bitwise OR of the bitmasks of columns of a table
+ *
+ * If any of the columns isn't nullable, it is considered all valid.
+ * If no column in the table is nullable, an empty bitmask is returned.
+ *
+ * @param view The table of columns
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+rmm::device_buffer bitmask_or(
+  table_view const& view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -23,6 +24,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -41,6 +43,8 @@
 #include <numeric>
 #include <type_traits>
 
+using cudf::detail::device_span;
+
 namespace cudf {
 size_type state_null_count(mask_state state, size_type size)
 {
@@ -315,37 +319,6 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
   }
 }
 
-/**
- * @brief Computes the bitwise AND of an array of bitmasks
- *
- * @param destination The bitmask to write result into
- * @param source Array of source mask pointers. All masks must be of same size
- * @param begin_bit Array of offsets into corresponding @p source masks.
- *                  Must be same size as source array
- * @param num_sources Number of masks in @p source array
- * @param source_size Number of bits in each mask in @p source
- * @param number_of_mask_words The number of words of type bitmask_type to copy
- */
-__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination,
-                                   bitmask_type const *const *__restrict__ source,
-                                   size_type const *__restrict__ begin_bit,
-                                   size_type num_sources,
-                                   size_type source_size,
-                                   size_type number_of_mask_words)
-{
-  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
-       destination_word_index < number_of_mask_words;
-       destination_word_index += blockDim.x * gridDim.x) {
-    bitmask_type destination_word = ~bitmask_type{0};  // All bits 1
-    for (size_type i = 0; i < num_sources; i++) {
-      destination_word &= detail::get_mask_offset_word(
-        source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size);
-    }
-
-    destination[destination_word_index] = destination_word;
-  }
-}
-
 // convert [first_bit_index,last_bit_index) to
 // [first_word_index,last_word_index)
 struct to_word_index : public thrust::unary_function<size_type, size_type> {
@@ -430,25 +403,20 @@ void inplace_bitmask_and(bitmask_type *dest_mask,
 {
   CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
                "Invalid range.");
-  CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");
   CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
                "Mask pointer cannot be null");
 
-  auto number_of_mask_words = num_bitmask_words(mask_size);
-
   rmm::device_vector<bitmask_type const *> d_masks(masks);
   rmm::device_vector<size_type> d_begin_bits(begin_bits);
 
-  cudf::detail::grid_1d config(number_of_mask_words, 256);
-  offset_bitmask_and<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-    dest_mask,
-    d_masks.data().get(),
-    d_begin_bits.data().get(),
-    d_masks.size(),
+  inplace_bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    device_span<bitmask_type>(dest_mask, num_bitmask_words(mask_size)),
+    device_span<bitmask_type const *>(d_masks.data().get(), d_masks.size()),
+    device_span<size_type>(d_begin_bits.data().get(), d_begin_bits.size()),
     mask_size,
-    number_of_mask_words);
-
-  CHECK_CUDA(stream.value());
+    stream,
+    mr);
 }
 
 // Bitwise AND of the masks
@@ -458,14 +426,13 @@ rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource *mr)
 {
-  rmm::device_buffer dest_mask{};
-  auto num_bytes = bitmask_allocation_size_bytes(mask_size);
-
-  dest_mask = rmm::device_buffer{num_bytes, stream, mr};
-  inplace_bitmask_and(
-    static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);
-
-  return dest_mask;
+  return bitmask_binop(
+    [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+    masks,
+    begin_bits,
+    mask_size,
+    stream,
+    mr);
 }
 
 cudf::size_type count_set_bits(bitmask_type const *bitmask,
@@ -650,12 +617,47 @@ rmm::device_buffer bitmask_and(table_view const &view,
   }
 
   if (masks.size() > 0) {
-    return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr);
+    return cudf::detail::bitmask_binop(
+      [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
+      masks,
+      offsets,
+      view.num_rows(),
+      stream,
+      mr);
   }
 
   return null_mask;
 }
 
+rmm::device_buffer bitmask_or(table_view const &view,
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource *mr)
+{
+  CUDF_FUNC_RANGE();
+  rmm::device_buffer null_mask{0, stream, mr};
+  if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; }
+
+  std::vector<bitmask_type const *> masks;
+  std::vector<size_type> offsets;
+  for (auto &&col : view) {
+    if (col.nullable()) {
+      masks.push_back(col.null_mask());
+      offsets.push_back(col.offset());
+    }
+  }
+
+  if (static_cast<size_type>(masks.size()) == view.num_columns()) {
+    return cudf::detail::bitmask_binop(
+      [] __device__(bitmask_type left, bitmask_type right) { return left | right; },
+      masks,
+      offsets,
+      view.num_rows(),
+      stream,
+      mr);
+  }
+
+  return null_mask;
+}
 }  // namespace detail
 
 // Count non-zero bits in the specified range
@@ -708,4 +710,9 @@ rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_re
   return detail::bitmask_and(view, rmm::cuda_stream_default, mr);
 }
 
+rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr)
+{
+  return detail::bitmask_or(view, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace cudf
@@ -520,7 +520,7 @@ public final ColumnVector normalizeNANsAndZeros() {
    * @return the new ColumnVector with merged null mask.
    */
   public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... columns) {
-    assert mergeOp == BinaryOp.BITWISE_AND : "Only BITWISE_AND supported right now";
+    assert mergeOp == BinaryOp.BITWISE_AND || mergeOp == BinaryOp.BITWISE_OR : "Only BITWISE_AND and BITWISE_OR supported right now";
     long[] columnViews = new long[columns.length];
     long size = getRowCount();
 

@@ -1305,10 +1305,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
     cudf::table_view *input_table = new cudf::table_view(column_views);
 
     cudf::binary_operator op = static_cast<cudf::binary_operator>(bin_op);
-    if(op == cudf::binary_operator::BITWISE_AND) {
-      copy->set_null_mask(cudf::bitmask_and(*input_table));
+    switch(op) {
+      case cudf::binary_operator::BITWISE_AND:
+        copy->set_null_mask(cudf::bitmask_and(*input_table));
+        break;
+      case cudf::binary_operator::BITWISE_OR:
+        copy->set_null_mask(cudf::bitmask_or(*input_table));
+        break;
+      default:
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Unsupported merge operation", 0);
     }
-
     return reinterpret_cast<jlong>(copy.release());
   }
   CATCH_STD(env, 0);