bitmask_or implementation with bitmask refactor (rapidsai#7406)

Refactors the bitmask merging functionality to support any binary function, allowing for `bitwise_or` support in addition the existing `bitwise_and` support. Includes changes to the Java api and JNI to access the `bitwise_or` functionality. Authors: - @rwlee Approvers: - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - Christopher Harris (@cwharris) URL: rapidsai#7406
hyperbolic2346 · Mar 23, 2021 · e6659ca · e6659ca
1 parent 7b02cb1
commit e6659ca
Show file tree

Hide file tree

Showing 10 changed files with 357 additions and 100 deletions.
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
@@ -856,7 +856,7 @@ namespace detail {
  * @brief Convenience function to get offset word from a bitmask
  *
  * @see copy_offset_bitmask
- * @see offset_bitmask_and
+ * @see offset_bitmask_binop
  */
 __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source,
                                                     size_type destination_word_index,

diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/null_mask.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+using cudf::detail::device_span;
+
+namespace cudf {
+namespace detail {
+/**
+ * @brief Computes the merger of an array of bitmasks using a binary operator
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param destination The bitmask to write result into
+ * @param source Array of source mask pointers. All masks must be of same size
+ * @param source_begin_bits Array of offsets into corresponding @p source masks.
+ *                          Must be same size as source array
+ * @param source_size_bits Number of bits in each mask in @p source
+ */
+template <typename Binop>
+__global__ void offset_bitmask_binop(Binop op,
+                                     device_span<bitmask_type> destination,
+                                     device_span<bitmask_type const *> source,
+                                     device_span<size_type const> source_begin_bits,
+                                     size_type source_size_bits)
+{
+  for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
+       destination_word_index < destination.size();
+       destination_word_index += blockDim.x * gridDim.x) {
+    bitmask_type destination_word =
+      detail::get_mask_offset_word(source[0],
+                                   destination_word_index,
+                                   source_begin_bits[0],
+                                   source_begin_bits[0] + source_size_bits);
+    for (size_type i = 1; i < source.size(); i++) {
+      destination_word =
+
+        op(destination_word,
+           detail::get_mask_offset_word(source[i],
+                                        destination_word_index,
+                                        source_begin_bits[i],
+                                        source_begin_bits[i] + source_size_bits));
+    }
+
+    destination[destination_word_index] = destination_word;
+  }
+}
+
+/**
+ * @copydoc bitmask_binop(Binop op, host_span<bitmask_type const *> const, host_span<size_type>
+ * const, size_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename Binop>
+rmm::device_buffer bitmask_binop(
+  Binop op,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};
+
+  inplace_bitmask_binop(op,
+                        device_span<bitmask_type>(static_cast<bitmask_type *>(dest_mask.data()),
+                                                  num_bitmask_words(mask_size_bits)),
+                        masks,
+                        masks_begin_bits,
+                        mask_size_bits,
+                        stream,
+                        mr);
+
+  return dest_mask;
+}
+
+/**
+ * @brief Performs a merge of the specified bitmasks using the binary operator
+ *        provided, and writes in place to destination
+ *
+ * @param op The binary operator used to combine the bitmasks
+ * @param dest_mask Destination to which the merged result is written
+ * @param masks The list of data pointers of the bitmasks to be merged
+ * @param masks_begin_bits The bit offsets from which each mask is to be merged
+ * @param mask_size_bits The number of bits to be ANDed in each mask
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+template <typename Binop>
+void inplace_bitmask_binop(
+  Binop op,
+  device_span<bitmask_type> dest_mask,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(
+    std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }),
+    "Invalid range.");
+  CUDF_EXPECTS(mask_size_bits > 0, "Invalid bit range.");
+  CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
+               "Mask pointer cannot be null");
+
+  rmm::device_uvector<bitmask_type const *> d_masks(masks.size(), stream, mr);
+  rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
+
+  CUDA_TRY(cudaMemcpyAsync(
+    d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                           masks_begin_bits.data(),
+                           masks_begin_bits.size_bytes(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  cudf::detail::grid_1d config(dest_mask.size(), 256);
+  offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+    op, dest_mask, d_masks, d_begin_bits, mask_size_bits);
+  CHECK_CUDA(stream.value());
+  stream.synchronize();
+}
+
+}  // namespace detail
+
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -88,15 +89,15 @@ rmm::device_buffer copy_bitmask(
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
 /**
- * @copydoc bitmask_and(std::vector<bitmask_type const*>, std::vector<size_type> const&, size_type,
- * rmm::mr::device_memory_resource *)
+ * @copydoc bitmask_and(host_span<bitmask_type const *> const, host_span<size_type> const,
+ * size_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 rmm::device_buffer bitmask_and(
-  std::vector<bitmask_type const *> const &masks,
-  std::vector<size_type> const &begin_bits,
-  size_type mask_size,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
@@ -110,23 +111,33 @@ rmm::device_buffer bitmask_and(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::bitmask_or
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+rmm::device_buffer bitmask_or(
+  table_view const &view,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a bitwise AND of the specified bitmasks,
  *        and writes in place to destination
  *
  * @param dest_mask Destination to which the AND result is written
  * @param masks The list of data pointers of the bitmasks to be ANDed
- * @param begin_bits The bit offsets from which each mask is to be ANDed
- * @param mask_size The number of bits to be ANDed in each mask
+ * @param masks_begin_bits The bit offsets from which each mask is to be ANDed
+ * @param mask_size_bits The number of bits to be ANDed in each mask
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  * @return rmm::device_buffer Output bitmask
  */
 void inplace_bitmask_and(
-  bitmask_type *dest_mask,
-  std::vector<bitmask_type const *> const &masks,
-  std::vector<size_type> const &begin_bits,
-  size_type mask_size,
+  device_span<bitmask_type> dest_mask,
+  host_span<bitmask_type const *> masks,
+  host_span<size_type const> masks_begin_bits,
+  size_type mask_size_bits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 

diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and(
   table_view const& view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a bitwise OR of the bitmasks of columns of a table
+ *
+ * If any of the columns isn't nullable, it is considered all valid.
+ * If no column in the table is nullable, an empty bitmask is returned.
+ *
+ * @param view The table of columns
+ * @param mr Device memory resource used to allocate the returned device_buffer
+ * @return rmm::device_buffer Output bitmask
+ */
+rmm::device_buffer bitmask_or(
+  table_view const& view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf