diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 275b8f9332f..a4e73c86e0e 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -107,6 +107,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp + - test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp - test -f $PREFIX/include/cudf/detail/contiguous_split.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate_masks.hpp similarity index 76% rename from cpp/include/cudf/detail/concatenate.cuh rename to cpp/include/cudf/detail/concatenate_masks.hpp index 51bcb1afa1f..e7086ea17a5 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate_masks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,11 @@ #include #include -#include -#include -#include #include #include - -#include +#include +#include namespace cudf { //! Inner interfaces and implementations @@ -39,12 +36,13 @@ namespace detail { * @param dest_mask The output buffer to copy null masks into * @param output_size The total number of null masks bits that are being copied * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(device_span d_views, - device_span d_offsets, - bitmask_type* dest_mask, - size_type output_size, - rmm::cuda_stream_view stream); +size_type concatenate_masks(device_span d_views, + device_span d_offsets, + bitmask_type* dest_mask, + size_type output_size, + rmm::cuda_stream_view stream); /** * @brief Concatenates `views[i]`'s bitmask from the bits @@ -54,10 +52,11 @@ void concatenate_masks(device_span d_views, * @param views Column views whose bitmasks will be concatenated * @param dest_mask The output buffer to copy null masks into * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(host_span views, - bitmask_type* dest_mask, - rmm::cuda_stream_view stream); +size_type concatenate_masks(host_span views, + bitmask_type* dest_mask, + rmm::cuda_stream_view stream); /** * @copydoc cudf::concatenate_masks(host_span, rmm::mr::device_memory_resource*) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index c42cc5c69f9..b17475cb877 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -13,10 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include +#include #include #include #include @@ -49,6 +49,7 @@ namespace cudf { namespace detail { +namespace { // From benchmark data, the fused kernel optimization appears to perform better // when there are more than a trivial number of columns, or when the null mask @@ -100,23 +101,29 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi * @brief Concatenates the null mask bits of all the column device views in the * `views` array to the destination bitmask. * + * @tparam block_size Block size for using with single_lane_block_sum_reduce + * * @param views Array of column_device_view * @param output_offsets Prefix sum of sizes of elements of `views` * @param number_of_views Size of `views` array * @param dest_mask The output buffer to copy null masks into - * @param number_of_mask_bits The total number of null masks bits that are being - * copied + * @param number_of_mask_bits The total number of null masks bits that are being copied + * @param out_valid_count To hold the total number of valid bits set */ +template __global__ void concatenate_masks_kernel(column_device_view const* views, size_t const* output_offsets, size_type number_of_views, bitmask_type* dest_mask, - size_type number_of_mask_bits) + size_type number_of_mask_bits, + size_type* out_valid_count) { size_type mask_index = threadIdx.x + blockIdx.x * blockDim.x; auto active_mask = __ballot_sync(0xFFFF'FFFFu, mask_index < number_of_mask_bits); + size_type warp_valid_count = 0; + while (mask_index < number_of_mask_bits) { size_type const source_view_index = thrust::upper_bound( @@ -129,32 +136,44 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, } bitmask_type const new_word = __ballot_sync(active_mask, bit_is_set); - if (threadIdx.x % detail::warp_size == 0) { dest_mask[word_index(mask_index)] = new_word; } + if (threadIdx.x % detail::warp_size == 0) { + dest_mask[word_index(mask_index)] = new_word; + warp_valid_count += __popc(new_word); + } mask_index += blockDim.x * gridDim.x; active_mask = __ballot_sync(active_mask, mask_index < number_of_mask_bits); } + + using detail::single_lane_block_sum_reduce; + auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } +} // namespace -void concatenate_masks(device_span d_views, - device_span d_offsets, - bitmask_type* dest_mask, - size_type output_size, - rmm::cuda_stream_view stream) +size_type concatenate_masks(device_span d_views, + device_span d_offsets, + bitmask_type* dest_mask, + size_type output_size, + rmm::cuda_stream_view stream) { + rmm::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); - concatenate_masks_kernel<<>>( - d_views.data(), - d_offsets.data(), - static_cast(d_views.size()), - dest_mask, - output_size); + concatenate_masks_kernel + <<>>( + d_views.data(), + d_offsets.data(), + static_cast(d_views.size()), + dest_mask, + output_size, + d_valid_count.data()); + return output_size - d_valid_count.value(stream); } -void concatenate_masks(host_span views, - bitmask_type* dest_mask, - rmm::cuda_stream_view stream) +size_type concatenate_masks(host_span views, + bitmask_type* dest_mask, + rmm::cuda_stream_view stream) { // Preprocess and upload inputs to device memory auto const device_views = create_device_views(views, stream); @@ -162,9 +181,10 @@ void concatenate_masks(host_span views, auto const& d_offsets = std::get<2>(device_views); auto const output_size = std::get<3>(device_views); - concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream); + return concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream); } +namespace { template __global__ void fused_concatenate_kernel(column_device_view const* input_views, size_t const* input_offsets, @@ -287,7 +307,8 @@ std::unique_ptr for_each_concatenate(host_span views, // If concatenated column is nullable, proceed to calculate it if (has_nulls) { - cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream); + col->set_null_count( + cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream)); } else { col->set_null_count(0); // prevent null count from being materialized } @@ -340,8 +361,6 @@ std::unique_ptr concatenate_dispatch::operator()() return cudf::structs::detail::concatenate(views, stream, mr); } -namespace { - void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index d543225d3eb..ab22c07e4d5 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include #include diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 98ad108655f..70b6563b1bc 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 33f44dbf8f5..15c1d21e74c 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -16,7 +16,6 @@ #include "join_common_utils.cuh" #include -#include #include #include #include diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index 69d6949ad4a..485903deaec 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -19,8 +19,10 @@ #include #include #include -#include +#include +#include #include +#include #include #include @@ -124,14 +126,9 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask( total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); - cudf::size_type null_count{0}; - if (has_nulls) { - cudf::detail::concatenate_masks(columns, static_cast(null_mask.data()), stream); - null_count = - std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) { - return col.null_count(); - }); - } + auto null_mask_data = static_cast(null_mask.data()); + auto const null_count = + has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; // assemble into outgoing list column return make_lists_column(total_list_count, diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 19552b2dc03..b5ec1c0812a 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -65,14 +66,9 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); - cudf::size_type null_count{0}; - if (has_nulls) { - cudf::detail::concatenate_masks(columns, static_cast(null_mask.data()), stream); - null_count = - std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) { - return col.null_count(); - }); - } + auto null_mask_data = static_cast(null_mask.data()); + auto const null_count = + has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; // assemble into outgoing list column return make_structs_column(