Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bitmask_or implementation with bitmask refactor #7406

Merged
merged 10 commits into from
Mar 8, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ namespace detail {
* @brief Convenience function to get offset word from a bitmask
*
* @see copy_offset_bitmask
* @see offset_bitmask_and
* @see offset_bitmask_binop
*/
__device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source,
size_type destination_word_index,
Expand Down
138 changes: 138 additions & 0 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

using cudf::detail::device_span;

namespace cudf {
namespace {
rwlee marked this conversation as resolved.
Show resolved Hide resolved
/**
* @brief Computes the merger of an array of bitmasks using a binary operator
*
* @param op The binary operator used to combine the bitmasks
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be of same size
* @param begin_bit Array of offsets into corresponding @p source masks.
* Must be same size as source array
* @param num_sources Number of masks in @p source array
* @param source_size Number of bits in each mask in @p source
* @param number_of_mask_words The number of words of type bitmask_type to copy
*/
template <typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const *> source,
device_span<size_type> begin_bit,
size_type source_size)
rwlee marked this conversation as resolved.
Show resolved Hide resolved
{
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < destination.size();
destination_word_index += blockDim.x * gridDim.x) {
bitmask_type destination_word = detail::get_mask_offset_word(
source[0], destination_word_index, begin_bit[0], begin_bit[0] + source_size);
for (size_type i = 1; i < source.size(); i++) {
destination_word =
op(destination_word,
detail::get_mask_offset_word(
source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size));
}

destination[destination_word_index] = destination_word;
}
}
} // namespace
namespace detail {
/**
* @copydoc bitmask_binop(Binop op, std::vector<bitmask_type const*>, std::vector<size_type> const&,
* size_type, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename Binop>
rmm::device_buffer bitmask_binop(
Binop op,
std::vector<bitmask_type const *> const &masks,
std::vector<size_type> const &begin_bits,
size_type mask_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
{
CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
"Invalid range.");
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");
rmm::device_buffer dest_mask{};
auto num_bytes = bitmask_allocation_size_bytes(mask_size);

rmm::device_vector<bitmask_type const *> d_masks(masks);
rwlee marked this conversation as resolved.
Show resolved Hide resolved
rmm::device_vector<size_type> d_begin_bits(begin_bits);
rwlee marked this conversation as resolved.
Show resolved Hide resolved

dest_mask = rmm::device_buffer{num_bytes, stream, mr};
rwlee marked this conversation as resolved.
Show resolved Hide resolved

inplace_bitmask_binop(op,
device_span<bitmask_type>(static_cast<bitmask_type *>(dest_mask.data()),
num_bitmask_words(mask_size)),
device_span<bitmask_type const *>(d_masks.data().get(), d_masks.size()),
device_span<size_type>(d_begin_bits.data().get(), d_begin_bits.size()),
mask_size,
stream,
mr);
rwlee marked this conversation as resolved.
Show resolved Hide resolved

return dest_mask;
rwlee marked this conversation as resolved.
Show resolved Hide resolved
}

/**
* @brief Performs a merger of the specified bitmasks using the binary operator
* provided, and writes in place to destination
*
* @param op The binary operator used to combine the bitmasks
* @param dest_mask Destination to which the AND result is written
* @param masks The list of data pointers of the bitmasks to be ANDed
* @param begin_bits The bit offsets from which each mask is to be ANDed
* @param mask_size The number of bits to be ANDed in each mask
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned device_buffer
* @return rmm::device_buffer Output bitmask
*/
template <typename Binop>
void inplace_bitmask_binop(
Binop op,
device_span<bitmask_type> dest_mask,
device_span<bitmask_type const *> masks,
device_span<size_type> begin_bits,
size_type mask_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
{
CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");

cudf::detail::grid_1d config(dest_mask.size(), 256);
offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
op, dest_mask, masks, begin_bits, mask_size);
CHECK_CUDA(stream.value());
}

} // namespace detail

} // namespace cudf
1 change: 1 addition & 0 deletions cpp/include/cudf/detail/null_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

Expand Down
14 changes: 14 additions & 0 deletions cpp/include/cudf/null_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a bitwise OR of the bitmasks of columns of a table
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
* @return rmm::device_buffer Output bitmask
*/
rmm::device_buffer bitmask_or(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
111 changes: 59 additions & 52 deletions cpp/src/bitmask/null_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/null_mask.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
Expand All @@ -23,6 +24,7 @@
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
Expand All @@ -41,6 +43,8 @@
#include <numeric>
#include <type_traits>

using cudf::detail::device_span;

namespace cudf {
size_type state_null_count(mask_state state, size_type size)
{
Expand Down Expand Up @@ -315,37 +319,6 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination,
}
}

/**
* @brief Computes the bitwise AND of an array of bitmasks
*
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be of same size
* @param begin_bit Array of offsets into corresponding @p source masks.
* Must be same size as source array
* @param num_sources Number of masks in @p source array
* @param source_size Number of bits in each mask in @p source
* @param number_of_mask_words The number of words of type bitmask_type to copy
*/
__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination,
bitmask_type const *const *__restrict__ source,
size_type const *__restrict__ begin_bit,
size_type num_sources,
size_type source_size,
size_type number_of_mask_words)
{
for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
destination_word_index < number_of_mask_words;
destination_word_index += blockDim.x * gridDim.x) {
bitmask_type destination_word = ~bitmask_type{0}; // All bits 1
for (size_type i = 0; i < num_sources; i++) {
destination_word &= detail::get_mask_offset_word(
source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size);
}

destination[destination_word_index] = destination_word;
}
}

// convert [first_bit_index,last_bit_index) to
// [first_word_index,last_word_index)
struct to_word_index : public thrust::unary_function<size_type, size_type> {
Expand Down Expand Up @@ -430,25 +403,20 @@ void inplace_bitmask_and(bitmask_type *dest_mask,
{
CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }),
"Invalid range.");
CUDF_EXPECTS(mask_size > 0, "Invalid bit range.");
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");

auto number_of_mask_words = num_bitmask_words(mask_size);

rmm::device_vector<bitmask_type const *> d_masks(masks);
rmm::device_vector<size_type> d_begin_bits(begin_bits);

cudf::detail::grid_1d config(number_of_mask_words, 256);
offset_bitmask_and<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
dest_mask,
d_masks.data().get(),
d_begin_bits.data().get(),
d_masks.size(),
inplace_bitmask_binop(
[] __device__(bitmask_type left, bitmask_type right) { return left & right; },
device_span<bitmask_type>(dest_mask, num_bitmask_words(mask_size)),
device_span<bitmask_type const *>(d_masks.data().get(), d_masks.size()),
device_span<size_type>(d_begin_bits.data().get(), d_begin_bits.size()),
rwlee marked this conversation as resolved.
Show resolved Hide resolved
mask_size,
number_of_mask_words);

CHECK_CUDA(stream.value());
stream,
mr);
rwlee marked this conversation as resolved.
Show resolved Hide resolved
}

// Bitwise AND of the masks
Expand All @@ -458,14 +426,13 @@ rmm::device_buffer bitmask_and(std::vector<bitmask_type const *> const &masks,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
rmm::device_buffer dest_mask{};
auto num_bytes = bitmask_allocation_size_bytes(mask_size);

dest_mask = rmm::device_buffer{num_bytes, stream, mr};
inplace_bitmask_and(
static_cast<bitmask_type *>(dest_mask.data()), masks, begin_bits, mask_size, stream, mr);

return dest_mask;
return bitmask_binop(
[] __device__(bitmask_type left, bitmask_type right) { return left & right; },
masks,
begin_bits,
mask_size,
stream,
mr);
}

cudf::size_type count_set_bits(bitmask_type const *bitmask,
Expand Down Expand Up @@ -650,12 +617,47 @@ rmm::device_buffer bitmask_and(table_view const &view,
}

if (masks.size() > 0) {
return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr);
return cudf::detail::bitmask_binop(
[] __device__(bitmask_type left, bitmask_type right) { return left & right; },
masks,
offsets,
view.num_rows(),
stream,
mr);
}

return null_mask;
}

rmm::device_buffer bitmask_or(table_view const &view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
CUDF_FUNC_RANGE();
rmm::device_buffer null_mask{0, stream, mr};
if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; }

std::vector<bitmask_type const *> masks;
std::vector<size_type> offsets;
for (auto &&col : view) {
if (col.nullable()) {
masks.push_back(col.null_mask());
offsets.push_back(col.offset());
}
}

if (static_cast<size_type>(masks.size()) == view.num_columns()) {
rwlee marked this conversation as resolved.
Show resolved Hide resolved
return cudf::detail::bitmask_binop(
[] __device__(bitmask_type left, bitmask_type right) { return left | right; },
masks,
offsets,
view.num_rows(),
stream,
mr);
}

return null_mask;
}
} // namespace detail

// Count non-zero bits in the specified range
Expand Down Expand Up @@ -708,4 +710,9 @@ rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_re
return detail::bitmask_and(view, rmm::cuda_stream_default, mr);
}

rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr)
{
return detail::bitmask_or(view, rmm::cuda_stream_default, mr);
}

} // namespace cudf
2 changes: 1 addition & 1 deletion java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ public final ColumnVector normalizeNANsAndZeros() {
* @return the new ColumnVector with merged null mask.
*/
public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... columns) {
assert mergeOp == BinaryOp.BITWISE_AND : "Only BITWISE_AND supported right now";
assert mergeOp == BinaryOp.BITWISE_AND || mergeOp == BinaryOp.BITWISE_OR : "Only BITWISE_AND and BITWISE_OR supported right now";
long[] columnViews = new long[columns.length];
long size = getRowCount();

Expand Down
12 changes: 9 additions & 3 deletions java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1305,10 +1305,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
cudf::table_view *input_table = new cudf::table_view(column_views);

cudf::binary_operator op = static_cast<cudf::binary_operator>(bin_op);
if(op == cudf::binary_operator::BITWISE_AND) {
copy->set_null_mask(cudf::bitmask_and(*input_table));
switch(op) {
case cudf::binary_operator::BITWISE_AND:
copy->set_null_mask(cudf::bitmask_and(*input_table));
break;
case cudf::binary_operator::BITWISE_OR:
copy->set_null_mask(cudf::bitmask_or(*input_table));
break;
default:
JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Unsupported merge operation", 0);
rwlee marked this conversation as resolved.
Show resolved Hide resolved
}

return reinterpret_cast<jlong>(copy.release());
}
CATCH_STD(env, 0);
Expand Down
Loading