Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Change cudf::detail::concatenate_masks to return null-count #13330

Merged
merged 6 commits
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/binaryop.hpp
- test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp
- test -f $PREFIX/include/cudf/detail/contiguous_split.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
- test -f $PREFIX/include/cudf/detail/datetime.hpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,14 +17,11 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/detail/concatenate.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <vector>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf {
//! Inner interfaces and implementations
Expand All @@ -39,12 +36,13 @@ namespace detail {
* @param dest_mask The output buffer to copy null masks into
* @param output_size The total number of null masks bits that are being copied
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return The number of nulls
*/
void concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream);
size_type concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream);

/**
* @brief Concatenates `views[i]`'s bitmask from the bits
Expand All @@ -54,10 +52,11 @@ void concatenate_masks(device_span<column_device_view const> d_views,
* @param views Column views whose bitmasks will be concatenated
* @param dest_mask The output buffer to copy null masks into
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return The number of nulls
*/
void concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream);
size_type concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream);

/**
* @copydoc cudf::concatenate_masks(host_span<column_view const>, rmm::mr::device_memory_resource*)
Expand Down
65 changes: 42 additions & 23 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/detail/concatenate.cuh>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
Expand Down Expand Up @@ -49,6 +49,7 @@

namespace cudf {
namespace detail {
namespace {

// From benchmark data, the fused kernel optimization appears to perform better
// when there are more than a trivial number of columns, or when the null mask
Expand Down Expand Up @@ -100,23 +101,29 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
* @brief Concatenates the null mask bits of all the column device views in the
* `views` array to the destination bitmask.
*
* @tparam block_size Block size for using with single_lane_block_sum_reduce
*
* @param views Array of column_device_view
* @param output_offsets Prefix sum of sizes of elements of `views`
* @param number_of_views Size of `views` array
* @param dest_mask The output buffer to copy null masks into
* @param number_of_mask_bits The total number of null masks bits that are being
* copied
* @param number_of_mask_bits The total number of null masks bits that are being copied
* @param out_valid_count To hold the total number of valid bits set
*/
template <size_type block_size>
__global__ void concatenate_masks_kernel(column_device_view const* views,
size_t const* output_offsets,
size_type number_of_views,
bitmask_type* dest_mask,
size_type number_of_mask_bits)
size_type number_of_mask_bits,
size_type* out_valid_count)
{
size_type mask_index = threadIdx.x + blockIdx.x * blockDim.x;

auto active_mask = __ballot_sync(0xFFFF'FFFFu, mask_index < number_of_mask_bits);

size_type warp_valid_count = 0;

while (mask_index < number_of_mask_bits) {
size_type const source_view_index =
thrust::upper_bound(
Expand All @@ -129,42 +136,55 @@ __global__ void concatenate_masks_kernel(column_device_view const* views,
}
bitmask_type const new_word = __ballot_sync(active_mask, bit_is_set);

if (threadIdx.x % detail::warp_size == 0) { dest_mask[word_index(mask_index)] = new_word; }
if (threadIdx.x % detail::warp_size == 0) {
dest_mask[word_index(mask_index)] = new_word;
warp_valid_count += __popc(new_word);
}

mask_index += blockDim.x * gridDim.x;
active_mask = __ballot_sync(active_mask, mask_index < number_of_mask_bits);
}

using detail::single_lane_block_sum_reduce;
auto const block_valid_count = single_lane_block_sum_reduce<block_size, 0>(warp_valid_count);
if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); }
}
} // namespace

void concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream)
size_type concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream)
{
rmm::device_scalar<size_type> d_valid_count(0, stream);
constexpr size_type block_size{256};
cudf::detail::grid_1d config(output_size, block_size);
concatenate_masks_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
d_views.data(),
d_offsets.data(),
static_cast<size_type>(d_views.size()),
dest_mask,
output_size);
concatenate_masks_kernel<block_size>
<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
d_views.data(),
d_offsets.data(),
static_cast<size_type>(d_views.size()),
dest_mask,
output_size,
d_valid_count.data());
return output_size - d_valid_count.value(stream);
}

void concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream)
size_type concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream)
{
// Preprocess and upload inputs to device memory
auto const device_views = create_device_views(views, stream);
auto const& d_views = std::get<1>(device_views);
auto const& d_offsets = std::get<2>(device_views);
auto const output_size = std::get<3>(device_views);

concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
return concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
}

namespace {
template <typename T, size_type block_size, bool Nullable>
__global__ void fused_concatenate_kernel(column_device_view const* input_views,
size_t const* input_offsets,
Expand Down Expand Up @@ -287,7 +307,8 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,

// If concatenated column is nullable, proceed to calculate it
if (has_nulls) {
cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream);
col->set_null_count(
cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream));
} else {
col->set_null_count(0); // prevent null count from being materialized
}
Expand Down Expand Up @@ -340,8 +361,6 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()
return cudf::structs::detail::concatenate(views, stream, mr);
}

namespace {

void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream);

/**
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/sorting.hpp>
Expand Down
1 change: 0 additions & 1 deletion cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include "join_common_utils.cuh"

#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/join.hpp>
#include <cudf/detail/structs/utilities.hpp>
Expand Down
15 changes: 6 additions & 9 deletions cpp/src/lists/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.cuh>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -124,14 +126,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
rmm::device_buffer null_mask = create_null_mask(
total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
cudf::size_type null_count{0};
if (has_nulls) {
cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
null_count =
std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
return col.null_count();
});
}
auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
auto const null_count =
has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};

// assemble into outgoing list column
return make_lists_column(total_list_count,
Expand Down
14 changes: 5 additions & 9 deletions cpp/src/structs/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/structs/utilities.hpp>
#include <cudf/structs/structs_column_view.hpp>
Expand Down Expand Up @@ -65,14 +66,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
rmm::device_buffer null_mask =
create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
cudf::size_type null_count{0};
if (has_nulls) {
cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
null_count =
std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
return col.null_count();
});
}
auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
auto const null_count =
has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};

// assemble into outgoing list column
return make_structs_column(
Expand Down