From d3382e0639406e600097957bcec3bf23ea6949fe Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Wed, 17 Feb 2021 17:39:29 -0800 Subject: [PATCH] bitmask_or implementation with bitmask refactor --- .../cudf/column/column_device_view.cuh | 2 +- cpp/include/cudf/detail/null_mask.hpp | 50 +++++++ cpp/include/cudf/null_mask.hpp | 14 ++ cpp/src/bitmask/null_mask.cu | 136 +++++++++++++----- .../main/java/ai/rapids/cudf/ColumnView.java | 2 +- java/src/main/native/src/ColumnViewJni.cpp | 12 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 21 ++- 7 files changed, 199 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 9f5c2f33aa6..23c8557f3bc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -805,7 +805,7 @@ namespace detail { * @brief Convenience function to get offset word from a bitmask * * @see copy_offset_bitmask - * @see offset_bitmask_and + * @see offset_bitmask_binop */ __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restrict__ source, size_type destination_word_index, diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 2f2bc91cb74..71009a32fcc 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -87,6 +87,21 @@ rmm::device_buffer copy_bitmask( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc bitmask_binop(Binop op, std::vector, std::vector const&, + * size_type, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template +rmm::device_buffer bitmask_binop( + Binop op, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + /** * @copydoc bitmask_and(std::vector, std::vector const&, size_type, * rmm::mr::device_memory_resource *) @@ -100,6 +115,18 @@ rmm::device_buffer bitmask_and( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::bitmask_binop + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +template +rmm::device_buffer bitmask_binop( + Binop op, + table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::bitmask_and * @@ -110,6 +137,29 @@ rmm::device_buffer bitmask_and( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a merger of the specified bitmasks using the binary operator + * provided, and writes in place to destination + * + * @param op The binary operator used to combine the bitmasks + * @param dest_mask Destination to which the AND result is written + * @param masks The list of data pointers of the bitmasks to be ANDed + * @param begin_bits The bit offsets from which each mask is to be ANDed + * @param mask_size The number of bits to be ANDed in each mask + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned device_buffer + * @return rmm::device_buffer Output bitmask + */ +template +void inplace_bitmask_binop( + Binop op, + bitmask_type *dest_mask, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a bitwise AND of the specified bitmasks, * and writes in place to destination diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 5e1f0f0802e..663c42cec17 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -220,5 +220,19 @@ rmm::device_buffer bitmask_and( table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a bitwise OR of the bitmasks of columns of a table + * + * If any of the columns isn't nullable, it is considered all valid. + * If no column in the table is nullable, an empty bitmask is returned. + * + * @param view The table of columns + * @param mr Device memory resource used to allocate the returned device_buffer + * @return rmm::device_buffer Output bitmask + */ +rmm::device_buffer bitmask_or( + table_view const& view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 8cdcefe9796..c0e0e27f3a6 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -316,8 +316,9 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, } /** - * @brief Computes the bitwise AND of an array of bitmasks + * @brief Computes the merger of an array of bitmasks using a binary operator * + * @param op The binary operator used to combine the bitmasks * @param destination The bitmask to write result into * @param source Array of source mask pointers. All masks must be of same size * @param begin_bit Array of offsets into corresponding @p source masks. @@ -326,20 +327,25 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, * @param source_size Number of bits in each mask in @p source * @param number_of_mask_words The number of words of type bitmask_type to copy */ -__global__ void offset_bitmask_and(bitmask_type *__restrict__ destination, - bitmask_type const *const *__restrict__ source, - size_type const *__restrict__ begin_bit, - size_type num_sources, - size_type source_size, - size_type number_of_mask_words) +template +__global__ void offset_bitmask_binop(Binop op, + bitmask_type *__restrict__ destination, + bitmask_type const *const *__restrict__ source, + size_type const *__restrict__ begin_bit, + size_type num_sources, + size_type source_size, + size_type number_of_mask_words) { for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x; destination_word_index < number_of_mask_words; destination_word_index += blockDim.x * gridDim.x) { - bitmask_type destination_word = ~bitmask_type{0}; // All bits 1 - for (size_type i = 0; i < num_sources; i++) { - destination_word &= detail::get_mask_offset_word( - source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size); + bitmask_type destination_word = detail::get_mask_offset_word( + source[0], destination_word_index, begin_bit[0], begin_bit[0] + source_size); + for (size_type i = 1; i < num_sources; i++) { + destination_word = + op(destination_word, + detail::get_mask_offset_word( + source[i], destination_word_index, begin_bit[i], begin_bit[i] + source_size)); } destination[destination_word_index] = destination_word; @@ -420,13 +426,14 @@ rmm::device_buffer copy_bitmask(column_view const &view, return null_mask; } -// Inplace Bitwise AND of the masks -void inplace_bitmask_and(bitmask_type *dest_mask, - std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +template +void inplace_bitmask_binop(Binop op, + bitmask_type *dest_mask, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }), "Invalid range."); @@ -440,7 +447,8 @@ void inplace_bitmask_and(bitmask_type *dest_mask, rmm::device_vector d_begin_bits(begin_bits); cudf::detail::grid_1d config(number_of_mask_words, 256); - offset_bitmask_and<<>>( + offset_bitmask_binop<<>>( + op, dest_mask, d_masks.data().get(), d_begin_bits.data().get(), @@ -451,23 +459,58 @@ void inplace_bitmask_and(bitmask_type *dest_mask, CHECK_CUDA(stream.value()); } -// Bitwise AND of the masks -rmm::device_buffer bitmask_and(std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +// Inplace Bitwise AND of the masks +void inplace_bitmask_and(bitmask_type *dest_mask, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + inplace_bitmask_binop( + [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, + dest_mask, + masks, + begin_bits, + mask_size, + stream, + mr); +} + +template +rmm::device_buffer bitmask_binop(Binop op, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_buffer dest_mask{}; auto num_bytes = bitmask_allocation_size_bytes(mask_size); dest_mask = rmm::device_buffer{num_bytes, stream, mr}; - inplace_bitmask_and( - static_cast(dest_mask.data()), masks, begin_bits, mask_size, stream, mr); + inplace_bitmask_binop( + op, static_cast(dest_mask.data()), masks, begin_bits, mask_size, stream, mr); return dest_mask; } +// Bitwise AND of the masks +rmm::device_buffer bitmask_and(std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + return bitmask_binop( + [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, + masks, + begin_bits, + mask_size, + stream, + mr); +} + cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop, @@ -631,10 +674,12 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, return ret; } -// Returns the bitwise AND of the null masks of all columns in the table view -rmm::device_buffer bitmask_and(table_view const &view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +// Returns the merged null masks of all columns in the table view +template +rmm::device_buffer bitmask_binop(Binop op, + table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -650,12 +695,34 @@ rmm::device_buffer bitmask_and(table_view const &view, } if (masks.size() > 0) { - return cudf::detail::bitmask_and(masks, offsets, view.num_rows(), stream, mr); + return cudf::detail::bitmask_binop(op, masks, offsets, view.num_rows(), stream, mr); } return null_mask; } +// Returns the bitwise AND of the null masks of all columns in the table view +rmm::device_buffer bitmask_and(table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + return bitmask_binop( + [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, + view, + stream, + mr); +} + +rmm::device_buffer bitmask_or(table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + return bitmask_binop( + [] __device__(bitmask_type left, bitmask_type right) { return left | right; }, + view, + stream, + mr); +} } // namespace detail // Count non-zero bits in the specified range @@ -708,4 +775,9 @@ rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_re return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } +rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr) +{ + return detail::bitmask_or(view, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 1dce52f7105..03bc3654110 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -520,7 +520,7 @@ public final ColumnVector normalizeNANsAndZeros() { * @return the new ColumnVector with merged null mask. */ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... columns) { - assert mergeOp == BinaryOp.BITWISE_AND : "Only BITWISE_AND supported right now"; + assert mergeOp == BinaryOp.BITWISE_AND || mergeOp == BinaryOp.BITWISE_OR : "Only BITWISE_AND and BITWISE_OR supported right now"; long[] columnViews = new long[columns.length]; long size = getRowCount(); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 82e71b04a2f..1cdd5ec29ef 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1305,10 +1305,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit cudf::table_view *input_table = new cudf::table_view(column_views); cudf::binary_operator op = static_cast(bin_op); - if(op == cudf::binary_operator::BITWISE_AND) { - copy->set_null_mask(cudf::bitmask_and(*input_table)); + switch(op) { + case cudf::binary_operator::BITWISE_AND: + copy->set_null_mask(cudf::bitmask_and(*input_table)); + break; + case cudf::binary_operator::BITWISE_OR: + copy->set_null_mask(cudf::bitmask_or(*input_table)); + break; + default: + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Unsupported merge operation", 0); } - return reinterpret_cast(copy.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index cb1f792b99e..d83d5f97a7e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -571,7 +571,7 @@ void testSpark32BitMurmur3HashMixed() { } @Test - void testNullReconfigureNulls() { + void testAndNullReconfigureNulls() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null); ColumnVector intResult = v1.mergeAndSetValidity(BinaryOp.BITWISE_AND, v0); @@ -585,6 +585,25 @@ void testNullReconfigureNulls() { } } + @Test + void testOrNullReconfigureNulls() { + try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); + ColumnVector v1 = ColumnVector.fromBoxedInts(0, 100, 1, 2, Integer.MIN_VALUE, null); + ColumnVector intResultV0 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0); + ColumnVector intResultV0V1 = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1); + ColumnVector intResultMulti = v1.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v0, v1, v1, v0, v1, v0); + ColumnVector v2 = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", "3"); + ColumnVector stringResult = v2.mergeAndSetValidity(BinaryOp.BITWISE_OR, v0, v1); + ColumnVector stringExpected = ColumnVector.fromStrings("0", "100", "1", "2", "MIN_VALUE", null); + ColumnVector noMaskResult = v2.mergeAndSetValidity(BinaryOp.BITWISE_OR)) { + assertColumnsAreEqual(v0, intResultV0); + assertColumnsAreEqual(v1, intResultV0V1); + assertColumnsAreEqual(v1, intResultMulti); + assertColumnsAreEqual(stringExpected, stringResult); + assertColumnsAreEqual(v2, noMaskResult); + } + } + @Test void isNotNullTestEmptyColumn() { try (ColumnVector v = ColumnVector.fromBoxedInts();