Use cuda::proclaim_return_type on device lambdas. (#14577)
This PR makes cudf compatible with Thrust 2 by adding `cuda::proclaim_return_type` to device lambdas, but does not upgrade the version of Thrust/CCCL just yet. Currently we use libcudacxx 2.1.0, which makes `cuda::proclaim_return_type` available, but we still use Thrust 1.17.2, which doesn't require device lambdas to have proclaimed return types. This change is separated out from #14576 and reduces the diff we must carry there, so that #14576 contains only packaging changes and version updates for the CCCL 2 migration. This PR is **nonbreaking**, while #14576 will be **breaking**.
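As a minimal sketch of the pattern this PR applies throughout (the lambda, types, and file name below are illustrative, not taken from the changed files), `cuda::proclaim_return_type` wraps an extended device lambda so that host-side Thrust code can name its return type, which nvcc does not expose to host code for such lambdas:

// sketch.cu -- compile with: nvcc --extended-lambda sketch.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cuda/functional>  // cuda::proclaim_return_type (available in libcudacxx 2.1.0)

int main()
{
  thrust::device_vector<int> in(10, 21);
  thrust::device_vector<double> out(10);

  // Thrust 1.17.2 deduces the lambda's return type itself; Thrust 2 requires
  // the return type to be proclaimed, so the lambda is wrapped:
  thrust::transform(in.begin(),
                    in.end(),
                    out.begin(),
                    cuda::proclaim_return_type<double>(
                      [] __device__(int x) { return x * 2.0; }));
  return 0;
}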

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: #14577
bdice authored Dec 8, 2023
1 parent 597a535 commit 7b551ef
Showing 77 changed files with 1,524 additions and 1,136 deletions.
21 changes: 12 additions & 9 deletions cpp/benchmarks/common/generate_input.cu
@@ -53,6 +53,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <cstdint>
 #include <memory>
@@ -247,12 +249,12 @@ struct random_value_fn<T, std::enable_if_t<cudf::is_chrono<T>()>> {
       sec.end(),
       ns.begin(),
       result.begin(),
-      [] __device__(int64_t sec_value, int64_t nanoseconds_value) {
+      cuda::proclaim_return_type<T>([] __device__(int64_t sec_value, int64_t nanoseconds_value) {
         auto const timestamp_ns =
           cudf::duration_s{sec_value} + cudf::duration_ns{nanoseconds_value};
         // Return value in the type's precision
         return T(cuda::std::chrono::duration_cast<typename T::duration>(timestamp_ns));
-      });
+      }));
     return result;
   }
 };
@@ -367,12 +369,13 @@ rmm::device_uvector<cudf::size_type> sample_indices_with_run_length(cudf::size_t
   // This is gather.
   auto avg_repeated_sample_indices_iterator = thrust::make_transform_iterator(
     thrust::make_counting_iterator(0),
-    [rb              = run_lens.begin(),
-     re              = run_lens.end(),
-     samples_indices = samples_indices.begin()] __device__(cudf::size_type i) {
-      auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb;
-      return samples_indices[sample_idx];
-    });
+    cuda::proclaim_return_type<cudf::size_type>(
+      [rb              = run_lens.begin(),
+       re              = run_lens.end(),
+       samples_indices = samples_indices.begin()] __device__(cudf::size_type i) {
+        auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb;
+        return samples_indices[sample_idx];
+      }));
   rmm::device_uvector<cudf::size_type> repeated_sample_indices(num_rows,
                                                                cudf::get_default_stream());
   thrust::copy(thrust::device,
@@ -513,7 +516,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
     lengths.end(),
     null_mask.begin(),
     lengths.begin(),
-    [] __device__(auto) { return 0; },
+    cuda::proclaim_return_type<cudf::size_type>([] __device__(auto) { return 0; }),
     thrust::logical_not<bool>{});
   auto valid_lengths = thrust::make_transform_iterator(
     thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())),
5 changes: 4 additions & 1 deletion cpp/include/cudf/column/column_view.hpp
@@ -478,7 +478,10 @@ class mutable_column_view : public detail::column_view_base {
  public:
   mutable_column_view() = default;
 
-  ~mutable_column_view() = default;
+  ~mutable_column_view(){
+      // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed'
+      // from a host+device function marking the implicit version also as host+device
+  };
 
   mutable_column_view(mutable_column_view const&) = default;  ///< Copy constructor
   mutable_column_view(mutable_column_view&&) = default;       ///< Move constructor
21 changes: 12 additions & 9 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -37,6 +37,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <iterator>
 #include <optional>
@@ -330,20 +332,21 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
     // set bits from the length of the segment.
     auto segments_begin =
       thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
-    auto segment_length_iterator =
-      thrust::transform_iterator(segments_begin, [] __device__(auto const& segment) {
+    auto segment_length_iterator = thrust::transform_iterator(
+      segments_begin, cuda::proclaim_return_type<size_type>([] __device__(auto const& segment) {
         auto const begin = thrust::get<0>(segment);
         auto const end   = thrust::get<1>(segment);
         return end - begin;
-      });
+      }));
     thrust::transform(rmm::exec_policy(stream),
                       segment_length_iterator,
                       segment_length_iterator + num_ranges,
                       d_bit_counts.data(),
                       d_bit_counts.data(),
-                      [] __device__(auto segment_size, auto segment_bit_count) {
-                        return segment_size - segment_bit_count;
-                      });
+                      cuda::proclaim_return_type<size_type>(
+                        [] __device__(auto segment_size, auto segment_bit_count) {
+                          return segment_size - segment_bit_count;
+                        }));
   }
 
   CUDF_CHECK_CUDA(stream.value());
@@ -541,12 +544,12 @@ std::pair<rmm::device_buffer, size_type> segmented_null_mask_reduction(
 {
   auto const segments_begin =
     thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
-  auto const segment_length_iterator =
-    thrust::make_transform_iterator(segments_begin, [] __device__(auto const& segment) {
+  auto const segment_length_iterator = thrust::make_transform_iterator(
+    segments_begin, cuda::proclaim_return_type<size_type>([] __device__(auto const& segment) {
       auto const begin = thrust::get<0>(segment);
       auto const end   = thrust::get<1>(segment);
       return end - begin;
-    });
+    }));
 
   auto const num_segments =
     static_cast<size_type>(std::distance(first_bit_indices_begin, first_bit_indices_end));
9 changes: 6 additions & 3 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -27,6 +27,8 @@
 #include <thrust/distance.h>
 #include <thrust/scan.h>
 
+#include <cuda/functional>
+
 #include <stdexcept>
 
 namespace cudf {
@@ -311,9 +313,10 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
   // using exclusive-scan technically requires count+1 input values even though
   // the final input value is never used.
   // The input iterator is wrapped here to allow the last value to be safely read.
-  auto map_fn = [begin, count] __device__(size_type idx) -> size_type {
-    return idx < count ? static_cast<size_type>(begin[idx]) : size_type{0};
-  };
+  auto map_fn =
+    cuda::proclaim_return_type<size_type>([begin, count] __device__(size_type idx) -> size_type {
+      return idx < count ? static_cast<size_type>(begin[idx]) : size_type{0};
+    });
   auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
   auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
63 changes: 63 additions & 0 deletions cpp/include/cudf/detail/utilities/cast_functor.cuh
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

/**
* @brief A casting functor wrapping another functor.
* @file
*/

#include <cudf/types.hpp>

#include <cuda/functional>

#include <type_traits>
#include <utility>

namespace cudf {
namespace detail {

/**
* @brief Functor that casts another functor's result to a specified type.
*
* CUB 2.0.0 reductions require that the binary operator returns the same type
* as the initial value type, so we wrap binary operators with this when used
* by CUB.
*/
template <typename ResultType, typename F>
struct cast_functor_fn {
F f;

template <typename... Ts>
CUDF_HOST_DEVICE inline ResultType operator()(Ts&&... args)
{
return static_cast<ResultType>(f(std::forward<Ts>(args)...));
}
};

/**
* @brief Function creating a casting functor.
*/
template <typename ResultType, typename F>
inline cast_functor_fn<ResultType, std::decay_t<F>> cast_functor(F&& f)
{
return cast_functor_fn<ResultType, std::decay_t<F>>{std::forward<F>(f)};
}

} // namespace detail

} // namespace cudf
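
A hedged usage sketch of this helper (the reduction, operator, and function name below are illustrative, not part of this diff): a binary operator that would naturally return a wider type is wrapped so that every invocation returns the initial value's type, as CUB 2.0.0 requires.

// Illustrative only; compile with: nvcc --extended-lambda
#include <cudf/detail/utilities/cast_functor.cuh>

#include <cub/device/device_reduce.cuh>

#include <cstddef>

void sum_floats(float const* d_in, float* d_out, int num_items, cudaStream_t stream)
{
  // Accumulates in double, so the raw operator returns double; the wrapper
  // casts each result back to float, the type of the initial value below.
  auto plus_op   = [] __device__(float a, float b) { return double{a} + double{b}; };
  auto binary_op = cudf::detail::cast_functor<float>(plus_op);

  std::size_t temp_bytes = 0;  // first call only computes the temp storage size
  cub::DeviceReduce::Reduce(nullptr, temp_bytes, d_in, d_out, num_items, binary_op, 0.0f, stream);
  // ...allocate temp_bytes bytes of device memory, then call Reduce again to run it.
}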
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/utilities/element_argminmax.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ struct element_argminmax_fn {
   bool const has_nulls;
   bool const arg_min;
 
-  __device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
+  __device__ inline size_type operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
   {
     // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
     // github.com/NVIDIA/thrust/issues/1525
16 changes: 9 additions & 7 deletions cpp/include/cudf/lists/detail/gather.cuh
@@ -30,6 +30,8 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
 
+#include <cuda/functional>
+
 namespace cudf {
 namespace lists {
 namespace detail {
@@ -83,12 +85,12 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
 
   auto sizes_itr = cudf::detail::make_counting_transform_iterator(
     0,
-    [source_column_nullmask,
-     source_column_offset = source_column.offset(),
-     gather_map,
-     output_count,
-     src_offsets,
-     src_size] __device__(int32_t index) -> int32_t {
+    cuda::proclaim_return_type<int32_t>([source_column_nullmask,
+                                         source_column_offset = source_column.offset(),
+                                         gather_map,
+                                         output_count,
+                                         src_offsets,
+                                         src_size] __device__(int32_t index) -> int32_t {
       int32_t offset_index = index < output_count ? gather_map[index] : 0;
 
       // if this is an invalid index, this will be a NULL list
@@ -102,7 +104,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
 
       // the length of this list
       return src_offsets[offset_index + 1] - src_offsets[offset_index];
-    });
+    }));
 
   auto [dst_offsets_c, map_size] =
     cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + output_count, stream, mr);
12 changes: 8 additions & 4 deletions cpp/include/cudf/lists/detail/scatter.cuh
@@ -39,6 +39,8 @@
 #include <thrust/sequence.h>
 #include <thrust/transform.h>
 
+#include <cuda/functional>
+
 #include <cinttypes>
 
 namespace cudf {
@@ -62,9 +64,10 @@ rmm::device_uvector<unbound_list_view> list_vector_from_column(
                     index_begin,
                     index_end,
                     vector.begin(),
-                    [label, lists_column] __device__(size_type row_index) {
-                      return unbound_list_view{label, lists_column, row_index};
-                    });
+                    cuda::proclaim_return_type<unbound_list_view>(
+                      [label, lists_column] __device__(size_type row_index) {
+                        return unbound_list_view{label, lists_column, row_index};
+                      }));
 
   return vector;
 }
@@ -115,7 +118,8 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
   lists_column_view(target);  // Checks that target is a list column.
 
   auto list_size_begin = thrust::make_transform_iterator(
-    target_vector.begin(), [] __device__(unbound_list_view l) { return l.size(); });
+    target_vector.begin(),
+    cuda::proclaim_return_type<size_type>([] __device__(unbound_list_view l) { return l.size(); }));
   auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column(
     list_size_begin, list_size_begin + target.size(), stream, mr));
 
7 changes: 4 additions & 3 deletions cpp/include/cudf/reduction/detail/reduction.cuh
@@ -19,6 +19,7 @@
 #include "reduction_operators.cuh"
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/cast_functor.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -64,7 +65,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
   auto const initial_value = init.value_or(op.template get_identity<OutputType>());
   auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream, mr};
 
@@ -124,7 +125,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
   auto const initial_value = init.value_or(op.template get_identity<OutputType>());
   auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream};
 
@@ -190,7 +191,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
   auto const initial_value = op.template get_identity<IntermediateType>();
 
   rmm::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
10 changes: 6 additions & 4 deletions cpp/include/cudf/reduction/detail/segmented_reduction.cuh
@@ -18,6 +18,8 @@
 
 #include "reduction_operators.cuh"
 
+#include <cudf/detail/utilities/cast_functor.cuh>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
@@ -45,7 +47,7 @@ namespace detail {
  * @param d_offset_begin Begin iterator to segment indices
  * @param d_offset_end End iterator to segment indices
  * @param d_out Output data iterator
- * @param binary_op The reduction operator
+ * @param op The reduction operator
  * @param initial_value Initial value of the reduction
  * @param stream CUDA stream used for device memory operations and kernel launches
 *
@@ -61,12 +63,12 @@ void segmented_reduce(InputIterator d_in,
                       OffsetIterator d_offset_begin,
                       OffsetIterator d_offset_end,
                       OutputIterator d_out,
-                      BinaryOp binary_op,
+                      BinaryOp op,
                       OutputType initial_value,
                       rmm::cuda_stream_view stream)
 {
   auto const num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;
-
+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op);
   // Allocate temporary storage
   size_t temp_storage_bytes = 0;
   cub::DeviceSegmentedReduce::Reduce(nullptr,
@@ -148,8 +150,8 @@ void segmented_reduce(InputIterator d_in,
   using OutputType = typename thrust::iterator_value<OutputIterator>::type;
   using IntermediateType = typename thrust::iterator_value<InputIterator>::type;
   auto num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;
-  auto const binary_op = op.get_binary_op();
   auto const initial_value = op.template get_identity<IntermediateType>();
+  auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
 
   rmm::device_uvector<IntermediateType> intermediate_result{static_cast<std::size_t>(num_segments),
                                                             stream};
11 changes: 7 additions & 4 deletions cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -29,6 +29,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
 
+#include <cuda/functional>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -78,10 +80,11 @@ std::unique_ptr<cudf::column> copy_if_else(StringIterLeft lhs_begin,
   auto null_mask = (null_count > 0) ? std::move(valid_mask.first) : rmm::device_buffer{};
 
   // build offsets column
-  auto offsets_transformer = [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
-    auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
-    return result.has_value() ? result->size_bytes() : 0;
-  };
+  auto offsets_transformer = cuda::proclaim_return_type<size_type>(
+    [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
+      auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
+      return result.has_value() ? result->size_bytes() : 0;
+    });
 
   auto offsets_transformer_itr = thrust::make_transform_iterator(
     thrust::make_counting_iterator<size_type>(0), offsets_transformer);