From 56d83e2be92d05006e2eb50703da60b887c40998 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 14 Jul 2021 11:06:29 -0500 Subject: [PATCH] Refactor detail implementations of segmented_count_unset_bits() and segmented_count_set_bits() to take iterators for range indices. --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/null_mask.cuh | 280 ++++++++++++++++++++++++ cpp/src/bitmask/null_mask.cu | 238 ++------------------ cpp/src/copying/{slice.cpp => slice.cu} | 23 +- 4 files changed, 311 insertions(+), 232 deletions(-) rename cpp/src/copying/{slice.cpp => slice.cu} (82%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3582f29bf11..b561789a494 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -174,7 +174,7 @@ add_library(cudf src/copying/sample.cu src/copying/scatter.cu src/copying/shift.cu - src/copying/slice.cpp + src/copying/slice.cu src/copying/split.cpp src/copying/segmented_shift.cu src/datetime/datetime_ops.cu diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e507bacb919..875873f026b 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -141,6 +142,285 @@ void inplace_bitmask_binop( stream.synchronize(); } +namespace { + +/** + * For each range `[first_bit_indices[i], last_bit_indices[i])` + * (where 0 <= i < `num_ranges`), count the number of bits set outside the range + * in the boundary words (i.e. words that include either + * `first_bit_indices[i]'th` bit or `(last_bit_indices[i] - 1)'th` bit) and + * subtract the count from the range's null count. + * + * Expects `0 <= first_bit_indices[i] <= last_bit_indices[i]`. + * + * @param[in] bitmask The bitmask whose non-zero bits outside the range in the + * boundary words will be counted. + * @param[in] num_ranges The number of ranges + * @param[in] first_bit_indices The indices (inclusive) of the first bit in each + * range + * @param[in] last_bit_indices The indices (exclusive) of the last bit in each + * range + * @param[in,out] null_counts The number of non-zero bits in each range to be + * updated + */ +template +__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const* bitmask, + size_type num_ranges, + OffsetIterator first_bit_indices, + OffsetIterator last_bit_indices, + OutputIterator null_counts) +{ + constexpr size_type const word_size_in_bits{detail::size_in_bits()}; + + cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type range_id = tid; + + while (range_id < num_ranges) { + size_type const first_bit_index = *(first_bit_indices + range_id); + size_type const last_bit_index = *(last_bit_indices + range_id); + size_type delta = 0; + size_type num_slack_bits = 0; + + // compute delta due to the preceding bits in the first word in the range + + num_slack_bits = intra_word_index(first_bit_index); + if (num_slack_bits > 0) { + bitmask_type word = bitmask[word_index(first_bit_index)]; + bitmask_type slack_mask = set_least_significant_bits(num_slack_bits); + delta -= __popc(word & slack_mask); + } + + // compute delta due to the following bits in the last word in the range + + num_slack_bits = (last_bit_index % word_size_in_bits) == 0 + ? 0 + : word_size_in_bits - intra_word_index(last_bit_index); + if (num_slack_bits > 0) { + bitmask_type word = bitmask[word_index(last_bit_index)]; + bitmask_type slack_mask = set_most_significant_bits(num_slack_bits); + delta -= __popc(word & slack_mask); + } + + size_type updated_null_count = *(null_counts + range_id) + delta; + *(null_counts + range_id) = updated_null_count; + + range_id += blockDim.x * gridDim.x; + } +} + +// convert [first_bit_index,last_bit_index) to +// [first_word_index,last_word_index) +struct to_word_index : public thrust::unary_function { + const bool _inclusive = false; + size_type const* const _d_bit_indices = nullptr; + + /** + * @brief Constructor of a functor that converts bit indices to bitmask word + * indices. + * + * @param[in] inclusive Flag that indicates whether bit indices are inclusive + * or exclusive. + * @param[in] d_bit_indices Pointer to an array of bit indices + */ + __host__ to_word_index(bool inclusive, size_type const* d_bit_indices) + : _inclusive(inclusive), _d_bit_indices(d_bit_indices) + { + } + + __device__ size_type operator()(const size_type& i) const + { + auto bit_index = _d_bit_indices[i]; + return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); + } +}; + +/** + * @brief Functor that returns the number of set bits for a specified word + * of a bitmask array. + * + */ +struct word_num_set_bits_functor { + word_num_set_bits_functor(bitmask_type const* bitmask_) : bitmask(bitmask_) {} + __device__ size_type operator()(size_type i) const + { + return static_cast(__popc(bitmask[i])); + } + bitmask_type const* bitmask; +}; + +} // anonymous namespace + +/** + * @brief Given a bitmask, counts the number of set (1) bits in every range + * `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin, + * indices_end) / 2). + * + * Returns an empty vector if `bitmask == nullptr`. + * + * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0` + * @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] > + * indices_begin[(2*i)+1]` + * + * @param bitmask Bitmask residing in device memory whose bits will be counted + * @param indices_begin An iterator representing the beginning of the range of indices specifying + * ranges to count the number of set bits within + * @param indices_end An iterator representing the end of the range of indices specifying ranges to + * count the number of set bits within + * @param streaam CUDA stream used for device memory operations and kernel launches + * + * @return A vector storing the number of non-zero bits in the specified ranges + */ +template +std::vector segmented_count_set_bits(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + rmm::cuda_stream_view stream) +{ + size_t const num_indices = std::distance(indices_begin, indices_end); + + CUDF_EXPECTS(num_indices % 2 == 0, "Array of indices needs to have an even number of elements."); + for (size_t i = 0; i < num_indices / 2; i++) { + auto begin = indices_begin[i * 2]; + auto end = indices_begin[i * 2 + 1]; + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); + CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); + } + + if (num_indices == 0) { + return std::vector{}; + } else if (bitmask == nullptr) { + std::vector ret(num_indices / 2); + for (size_t i = 0; i < num_indices / 2; i++) { + ret[i] = indices_begin[2 * i + 1] - indices_begin[2 * i]; + } + return ret; + } + + size_type num_ranges = num_indices / 2; + std::vector h_first_indices(num_ranges); + std::vector h_last_indices(num_ranges); + thrust::stable_partition_copy(thrust::seq, + indices_begin, + indices_end, + thrust::make_counting_iterator(0), + h_first_indices.begin(), + h_last_indices.begin(), + [](auto i) { return (i % 2) == 0; }); + + auto d_first_indices = make_device_uvector_async(h_first_indices, stream); + auto d_last_indices = make_device_uvector_async(h_last_indices, stream); + rmm::device_uvector d_null_counts(num_ranges, stream); + + auto word_num_set_bits = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + word_num_set_bits_functor{bitmask}); + auto first_word_indices = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires + // first_word_indices and last_word_indices to have the same type. + to_word_index(true, d_first_indices.data())); + auto last_word_indices = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires + // first_word_indices and last_word_indices to have the same type. + to_word_index(false, d_last_indices.data())); + + // first allocate temporary memory + + size_t temp_storage_bytes{0}; + CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, + temp_storage_bytes, + word_num_set_bits, + d_null_counts.begin(), + num_ranges, + first_word_indices, + last_word_indices, + stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + // second perform segmented reduction + + CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(), + temp_storage_bytes, + word_num_set_bits, + d_null_counts.begin(), + num_ranges, + first_word_indices, + last_word_indices, + stream.value())); + + CHECK_CUDA(stream.value()); + + // third, adjust counts in segment boundaries (if segments are not + // word-aligned) + + constexpr size_type block_size{256}; + + cudf::detail::grid_1d grid(num_ranges, block_size); + + subtract_set_bits_range_boundaries_kerenel<<>>( + bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); + + CHECK_CUDA(stream.value()); + + std::vector ret(num_ranges); + CUDA_TRY(cudaMemcpyAsync(ret.data(), + d_null_counts.data(), + num_ranges * sizeof(size_type), + cudaMemcpyDeviceToHost, + stream.value())); + + stream.synchronize(); // now ret is valid. + + return ret; +} + +/** + * @brief Given a bitmask, counts the number of unset (0) bits in every range + * `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin, + * indices_end) / 2). + * + * Returns an empty vector if `bitmask == nullptr`. + * + * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0` + * @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] > + * indices_begin[(2*i)+1]` + * + * @param bitmask Bitmask residing in device memory whose bits will be counted + * @param indices_begin An iterator representing the beginning of the range of indices specifying + * ranges to count the number of unset bits within + * @param indices_end An iterator representing the end of the range of indices specifying ranges to + * count the number of unset bits within + * @param streaam CUDA stream used for device memory operations and kernel launches + * + * @return A vector storing the number of non-zero bits in the specified ranges + */ +template +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + rmm::cuda_stream_view stream) +{ + size_t const num_indices = std::distance(indices_begin, indices_end); + + if (num_indices == 0) { + return std::vector{}; + } else if (bitmask == nullptr) { + return std::vector(num_indices / 2, 0); + } + + auto ret = segmented_count_set_bits(bitmask, indices_begin, indices_end, stream); + for (size_t i = 0; i < ret.size(); i++) { + auto begin = indices_begin[i * 2]; + auto end = indices_begin[i * 2 + 1]; + ret[i] = (end - begin) - ret[i]; + } + + return ret; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index c3add0ea97e..fe13277ac8e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -228,70 +228,6 @@ __global__ void count_set_bits_kernel(bitmask_type const* bitmask, if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } } -/** - * For each range `[first_bit_indices[i], last_bit_indices[i])` - * (where 0 <= i < `num_ranges`), count the number of bits set outside the range - * in the boundary words (i.e. words that include either - * `first_bit_indices[i]'th` bit or `(last_bit_indices[i] - 1)'th` bit) and - * subtract the count from the range's null count. - * - * Expects `0 <= first_bit_indices[i] <= last_bit_indices[i]`. - * - * @param[in] bitmask The bitmask whose non-zero bits outside the range in the - * boundary words will be counted. - * @param[in] num_ranges The number of ranges - * @param[in] first_bit_indices The indices (inclusive) of the first bit in each - * range - * @param[in] last_bit_indices The indices (exclusive) of the last bit in each - * range - * @param[in,out] null_counts The number of non-zero bits in each range to be - * updated - */ -template -__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const* bitmask, - size_type num_ranges, - OffsetIterator first_bit_indices, - OffsetIterator last_bit_indices, - OutputIterator null_counts) -{ - constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - cudf::size_type range_id = tid; - - while (range_id < num_ranges) { - size_type const first_bit_index = *(first_bit_indices + range_id); - size_type const last_bit_index = *(last_bit_indices + range_id); - size_type delta = 0; - size_type num_slack_bits = 0; - - // compute delta due to the preceding bits in the first word in the range - - num_slack_bits = intra_word_index(first_bit_index); - if (num_slack_bits > 0) { - bitmask_type word = bitmask[word_index(first_bit_index)]; - bitmask_type slack_mask = set_least_significant_bits(num_slack_bits); - delta -= __popc(word & slack_mask); - } - - // compute delta due to the following bits in the last word in the range - - num_slack_bits = (last_bit_index % word_size_in_bits) == 0 - ? 0 - : word_size_in_bits - intra_word_index(last_bit_index); - if (num_slack_bits > 0) { - bitmask_type word = bitmask[word_index(last_bit_index)]; - bitmask_type slack_mask = set_most_significant_bits(num_slack_bits); - delta -= __popc(word & slack_mask); - } - - size_type updated_null_count = *(null_counts + range_id) + delta; - *(null_counts + range_id) = updated_null_count; - - range_id += blockDim.x * gridDim.x; - } -} - /** * @brief Copies the bits starting at the specified offset from a source * bitmask into the destination bitmask. @@ -319,32 +255,6 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, } } -// convert [first_bit_index,last_bit_index) to -// [first_word_index,last_word_index) -struct to_word_index : public thrust::unary_function { - const bool _inclusive = false; - size_type const* const _d_bit_indices = nullptr; - - /** - * @brief Constructor of a functor that converts bit indices to bitmask word - * indices. - * - * @param[in] inclusive Flag that indicates whether bit indices are inclusive - * or exclusive. - * @param[in] d_bit_indices Pointer to an array of bit indices - */ - __host__ to_word_index(bool inclusive, size_type const* d_bit_indices) - : _inclusive(inclusive), _d_bit_indices(d_bit_indices) - { - } - - __device__ size_type operator()(const size_type& i) const - { - auto bit_index = _d_bit_indices[i]; - return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); - } -}; - } // namespace namespace detail { @@ -461,131 +371,6 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); } -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - host_span indices, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(indices.size() % 2 == 0, - "Array of indices needs to have an even number of elements."); - for (size_t i = 0; i < indices.size() / 2; i++) { - auto begin = indices[i * 2]; - auto end = indices[i * 2 + 1]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - } - - if (indices.empty()) { - return std::vector{}; - } else if (bitmask == nullptr) { - std::vector ret(indices.size() / 2); - for (size_t i = 0; i < indices.size() / 2; i++) { - ret[i] = indices[2 * i + 1] - indices[2 * i]; - } - return ret; - } - - size_type num_ranges = indices.size() / 2; - std::vector h_first_indices(num_ranges); - std::vector h_last_indices(num_ranges); - thrust::stable_partition_copy(thrust::seq, - std::begin(indices), - std::end(indices), - thrust::make_counting_iterator(0), - h_first_indices.begin(), - h_last_indices.begin(), - [](auto i) { return (i % 2) == 0; }); - - auto d_first_indices = make_device_uvector_async(h_first_indices, stream); - auto d_last_indices = make_device_uvector_async(h_last_indices, stream); - rmm::device_uvector d_null_counts(num_ranges, stream); - - auto word_num_set_bits = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [bitmask] __device__(auto i) { return static_cast(__popc(bitmask[i])); }); - auto first_word_indices = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires - // first_word_indices and last_word_indices to have the same type. - to_word_index(true, d_first_indices.data())); - auto last_word_indices = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires - // first_word_indices and last_word_indices to have the same type. - to_word_index(false, d_last_indices.data())); - - // first allocate temporary memory - - size_t temp_storage_bytes{0}; - CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, - temp_storage_bytes, - word_num_set_bits, - d_null_counts.begin(), - num_ranges, - first_word_indices, - last_word_indices, - stream.value())); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // second perform segmented reduction - - CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(), - temp_storage_bytes, - word_num_set_bits, - d_null_counts.begin(), - num_ranges, - first_word_indices, - last_word_indices, - stream.value())); - - CHECK_CUDA(stream.value()); - - // third, adjust counts in segment boundaries (if segments are not - // word-aligned) - - constexpr size_type block_size{256}; - - cudf::detail::grid_1d grid(num_ranges, block_size); - - subtract_set_bits_range_boundaries_kerenel<<>>( - bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); - - CHECK_CUDA(stream.value()); - - std::vector ret(num_ranges); - CUDA_TRY(cudaMemcpyAsync(ret.data(), - d_null_counts.data(), - num_ranges * sizeof(size_type), - cudaMemcpyDeviceToHost, - stream.value())); - - stream.synchronize(); // now ret is valid. - - return ret; -} - -std::vector segmented_count_unset_bits(bitmask_type const* bitmask, - host_span indices, - rmm::cuda_stream_view stream) -{ - if (indices.empty()) { - return std::vector{}; - } else if (bitmask == nullptr) { - return std::vector(indices.size() / 2, 0); - } - - auto ret = segmented_count_set_bits(bitmask, indices, stream); - for (size_t i = 0; i < ret.size(); i++) { - auto begin = indices[i * 2]; - auto end = indices[i * 2 + 1]; - ret[i] = (end - begin) - ret[i]; - } - - return ret; -} - // Returns the bitwise AND of the null masks of all columns in the table view rmm::device_buffer bitmask_and(table_view const& view, rmm::cuda_stream_view stream, @@ -647,6 +432,29 @@ rmm::device_buffer bitmask_or(table_view const& view, return null_mask; } + +/** + * @copydoc cudf::segmented_count_set_bits + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector segmented_count_set_bits(bitmask_type const* bitmask, + host_span indices, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_count_set_bits(bitmask, indices.begin(), indices.end(), stream); +} + +// Count zero bits in the specified ranges +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, + host_span indices, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_count_unset_bits(bitmask, indices.begin(), indices.end(), stream); +} + } // namespace detail // Count non-zero bits in the specified range diff --git a/cpp/src/copying/slice.cpp b/cpp/src/copying/slice.cu similarity index 82% rename from cpp/src/copying/slice.cpp rename to cpp/src/copying/slice.cu index 87ff5792384..0e41689dc4b 100644 --- a/cpp/src/copying/slice.cpp +++ b/cpp/src/copying/slice.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -36,21 +36,12 @@ std::vector slice(column_view const& input, if (indices.empty()) return {}; - auto null_counts = [&]() { - // need to shift incoming indices by the column offset to generate the correct bit ranges - // to count - if (input.offset() > 0) { - std::vector shifted_indices; - shifted_indices.reserve(indices.size()); - std::transform(indices.begin(), - indices.end(), - std::back_inserter(shifted_indices), - [offset = input.offset()](size_type index) { return index + offset; }); - return cudf::detail::segmented_count_unset_bits(input.null_mask(), shifted_indices, stream); - } - // can use the initial indices - return cudf::detail::segmented_count_unset_bits(input.null_mask(), indices, stream); - }(); + // need to shift incoming indices by the column offset to generate the correct bit ranges + // to count + auto indices_iter = cudf::detail::make_counting_transform_iterator( + 0, [offset = input.offset(), &indices](size_type index) { return indices[index] + offset; }); + auto null_counts = cudf::detail::segmented_count_unset_bits( + input.null_mask(), indices_iter, indices_iter + indices.size(), stream); auto const children = std::vector(input.child_begin(), input.child_end());