Use cuda::proclaim_return_type on device lambdas. #14577

Merged: 4 commits, Dec 8, 2023
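For context on the change: nvcc cannot deduce the return type of an extended __device__ lambda from host code, so host-side machinery (Thrust iterator adaptors, CUB dispatch) that needs the functor's result type can fail to compile or deduce the wrong type. libcudacxx's cuda::proclaim_return_type, from <cuda/functional>, wraps the lambda and declares its return type explicitly. A minimal sketch of the pattern applied throughout this PR (values and types are illustrative, not taken from the diff; build with nvcc's --extended-lambda):

  #include <thrust/device_vector.h>
  #include <thrust/transform.h>

  #include <cuda/functional>

  int main()
  {
    thrust::device_vector<int> in(10, 2);
    thrust::device_vector<int> out(10);

    // Host code cannot see the __device__ lambda's return type, so proclaim it
    // explicitly; Thrust then treats the wrapper as a functor with a known result type.
    thrust::transform(
      in.begin(),
      in.end(),
      out.begin(),
      cuda::proclaim_return_type<int>([] __device__(int x) { return x * x; }));
    return 0;
  }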
21 changes: 12 additions & 9 deletions cpp/benchmarks/common/generate_input.cu
@@ -53,6 +53,8 @@
#include <thrust/transform.h>
#include <thrust/tuple.h>

+#include <cuda/functional>
+
#include <algorithm>
#include <cstdint>
#include <memory>
@@ -247,12 +249,12 @@ struct random_value_fn<T, std::enable_if_t<cudf::is_chrono<T>()>> {
sec.end(),
ns.begin(),
result.begin(),
-  [] __device__(int64_t sec_value, int64_t nanoseconds_value) {
+  cuda::proclaim_return_type<T>([] __device__(int64_t sec_value, int64_t nanoseconds_value) {
auto const timestamp_ns =
cudf::duration_s{sec_value} + cudf::duration_ns{nanoseconds_value};
// Return value in the type's precision
return T(cuda::std::chrono::duration_cast<typename T::duration>(timestamp_ns));
-  });
+  }));
return result;
}
};
@@ -367,12 +369,13 @@ rmm::device_uvector<cudf::size_type> sample_indices_with_run_length(cudf::size_t
// This is gather.
auto avg_repeated_sample_indices_iterator = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
-  [rb = run_lens.begin(),
-   re = run_lens.end(),
-   samples_indices = samples_indices.begin()] __device__(cudf::size_type i) {
-    auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb;
-    return samples_indices[sample_idx];
-  });
+  cuda::proclaim_return_type<cudf::size_type>(
+    [rb = run_lens.begin(),
+     re = run_lens.end(),
+     samples_indices = samples_indices.begin()] __device__(cudf::size_type i) {
+      auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb;
+      return samples_indices[sample_idx];
+    }));
rmm::device_uvector<cudf::size_type> repeated_sample_indices(num_rows,
cudf::get_default_stream());
thrust::copy(thrust::device,
@@ -513,7 +516,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
lengths.end(),
null_mask.begin(),
lengths.begin(),
-  [] __device__(auto) { return 0; },
+  cuda::proclaim_return_type<cudf::size_type>([] __device__(auto) { return 0; }),
thrust::logical_not<bool>{});
auto valid_lengths = thrust::make_transform_iterator(
thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())),
5 changes: 4 additions & 1 deletion cpp/include/cudf/column/column_view.hpp
@@ -478,7 +478,10 @@ class mutable_column_view : public detail::column_view_base {
public:
mutable_column_view() = default;

-  ~mutable_column_view() = default;
+  ~mutable_column_view(){
+    // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed'
+    // from a host+device function marking the implicit version also as host+device
Comment on lines +482 to +483

Contributor: wat :D

Contributor (author): That's a comment we acquired with help from @robertmaynard. I was suffering from inexplicable compilation problems and this fixed it.
+  };
Contributor: This will trigger an IDE warning.

Suggested change:
-  };
+  }


mutable_column_view(mutable_column_view const&) = default; ///< Copy constructor
mutable_column_view(mutable_column_view&&) = default; ///< Move constructor
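For readers puzzled by the exchange above, a rough sketch of the failure mode the explicit destructor works around, as we understand it from the in-code comment (hypothetical names, not code from the PR): nvcc decides the execution space of an implicitly-defined special member when it is first instantiated in a translation unit, so the same destructor can end up host+device in one TU and host-only in another, and the mismatch surfaces as baffling compile errors. Spelling the destructor out pins a single definition.

  // Hypothetical illustration only:
  struct view {
    ~view() = default;              // execution space fixed at first instantiation
  };

  __host__ __device__ void f(view v)  // first instantiation here can mark ~view()
  {                                   // __host__ __device__ in this TU, while a
  }                                   // host-only TU generates a host-only ~view()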
21 changes: 12 additions & 9 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -37,6 +37,8 @@
#include <thrust/transform.h>
#include <thrust/tuple.h>

+#include <cuda/functional>
+
#include <algorithm>
#include <iterator>
#include <optional>
@@ -330,20 +332,21 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
// set bits from the length of the segment.
auto segments_begin =
thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
-  auto segment_length_iterator =
-    thrust::transform_iterator(segments_begin, [] __device__(auto const& segment) {
+  auto segment_length_iterator = thrust::transform_iterator(
+    segments_begin, cuda::proclaim_return_type<size_type>([] __device__(auto const& segment) {
auto const begin = thrust::get<0>(segment);
auto const end = thrust::get<1>(segment);
return end - begin;
-  });
+  }));
thrust::transform(rmm::exec_policy(stream),
segment_length_iterator,
segment_length_iterator + num_ranges,
d_bit_counts.data(),
d_bit_counts.data(),
-  [] __device__(auto segment_size, auto segment_bit_count) {
-    return segment_size - segment_bit_count;
-  });
+  cuda::proclaim_return_type<size_type>(
+    [] __device__(auto segment_size, auto segment_bit_count) {
+      return segment_size - segment_bit_count;
+    }));
}

CUDF_CHECK_CUDA(stream.value());
@@ -541,12 +544,12 @@ std::pair<rmm::device_buffer, size_type> segmented_null_mask_reduction(
{
auto const segments_begin =
thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
-  auto const segment_length_iterator =
-    thrust::make_transform_iterator(segments_begin, [] __device__(auto const& segment) {
+  auto const segment_length_iterator = thrust::make_transform_iterator(
+    segments_begin, cuda::proclaim_return_type<size_type>([] __device__(auto const& segment) {
auto const begin = thrust::get<0>(segment);
auto const end = thrust::get<1>(segment);
return end - begin;
-  });
+  }));

auto const num_segments =
static_cast<size_type>(std::distance(first_bit_indices_begin, first_bit_indices_end));
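For intuition, with illustrative numbers rather than values from the diff: a segment covering bit indices [3, 10) has length 10 - 3 = 7, and if 5 of those bits are set, the second transform above yields 7 - 5 = 2 unset bits.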
9 changes: 6 additions & 3 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -27,6 +27,8 @@
#include <thrust/distance.h>
#include <thrust/scan.h>

+#include <cuda/functional>
+
#include <stdexcept>

namespace cudf {
@@ -311,9 +313,10 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
// using exclusive-scan technically requires count+1 input values even though
// the final input value is never used.
// The input iterator is wrapped here to allow the last value to be safely read.
-  auto map_fn = [begin, count] __device__(size_type idx) -> size_type {
-    return idx < count ? static_cast<size_type>(begin[idx]) : size_type{0};
-  };
+  auto map_fn =
+    cuda::proclaim_return_type<size_type>([begin, count] __device__(size_type idx) -> size_type {
+      return idx < count ? static_cast<size_type>(begin[idx]) : size_type{0};
+    });
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
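The count+1 remark is easiest to see with a toy scan (illustrative values, not from the diff): three sizes yield four offsets, so the exclusive scan reads one element past the sizes even though that trailing value never contributes.

  #include <thrust/device_vector.h>
  #include <thrust/scan.h>

  #include <vector>

  int main()
  {
    // sizes {2, 3, 4} plus one trailing element that is read but never used
    std::vector<int> h{2, 3, 4, 0};
    thrust::device_vector<int> in(h.begin(), h.end());
    thrust::device_vector<int> out(4);

    // exclusive scan over count+1 inputs: out = {0, 2, 5, 9}
    thrust::exclusive_scan(in.begin(), in.end(), out.begin());
    return 0;
  }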
63 changes: 63 additions & 0 deletions cpp/include/cudf/detail/utilities/cast_functor.cuh
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

/**
* @brief A casting functor wrapping another functor.
* @file
*/

#include <cudf/types.hpp>

#include <cuda/functional>

#include <type_traits>
#include <utility>

namespace cudf {
namespace detail {

/**
* @brief Functor that casts another functor's result to a specified type.
*
* CUB 2.0.0 reductions require that the binary operator returns the same type
* as the initial value type, so we wrap binary operators with this when used
* by CUB.
*/
template <typename ResultType, typename F>
struct cast_functor_fn {
F f;

template <typename... Ts>
CUDF_HOST_DEVICE inline ResultType operator()(Ts&&... args)
{
return static_cast<ResultType>(f(std::forward<Ts>(args)...));
}
};

/**
* @brief Function creating a casting functor.
*/
template <typename ResultType, typename F>
inline cast_functor_fn<ResultType, std::decay_t<F>> cast_functor(F&& f)
{
return cast_functor_fn<ResultType, std::decay_t<F>>{std::forward<F>(f)};
}

} // namespace detail

} // namespace cudf
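A hedged usage sketch of the new helper (operator and types illustrative, not from the PR; assumes compilation inside a cudf translation unit): a transparent operator such as thrust::plus<> deduces its result from the argument types, which need not match the accumulator type a CUB reduction is configured with, and cast_functor pins the result type.

  #include <cudf/detail/utilities/cast_functor.cuh>

  #include <thrust/functional.h>

  #include <type_traits>

  // thrust::plus<> applied to (int, double) yields double; the wrapper
  // guarantees the int result type that the CUB call sites expect.
  auto binary_op = cudf::detail::cast_functor<int>(thrust::plus<>{});
  static_assert(std::is_same_v<decltype(binary_op(1, 2.5)), int>);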
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/utilities/element_argminmax.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ struct element_argminmax_fn {
bool const has_nulls;
bool const arg_min;

-  __device__ inline auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
+  __device__ inline size_type operator()(size_type const& lhs_idx, size_type const& rhs_idx) const
{
    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
    // github.com/NVIDIA/thrust/issues/1525
16 changes: 9 additions & 7 deletions cpp/include/cudf/lists/detail/gather.cuh
@@ -30,6 +30,8 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

+#include <cuda/functional>
+
namespace cudf {
namespace lists {
namespace detail {
@@ -83,12 +85,12 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,

auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0,
-    [source_column_nullmask,
-     source_column_offset = source_column.offset(),
-     gather_map,
-     output_count,
-     src_offsets,
-     src_size] __device__(int32_t index) -> int32_t {
+    cuda::proclaim_return_type<int32_t>([source_column_nullmask,
+                                         source_column_offset = source_column.offset(),
+                                         gather_map,
+                                         output_count,
+                                         src_offsets,
+                                         src_size] __device__(int32_t index) -> int32_t {
int32_t offset_index = index < output_count ? gather_map[index] : 0;

// if this is an invalid index, this will be a NULL list
@@ -102,7 +104,7 @@

// the length of this list
return src_offsets[offset_index + 1] - src_offsets[offset_index];
-  });
+  }));

auto [dst_offsets_c, map_size] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + output_count, stream, mr);
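For intuition (illustrative values, not from the diff): with src_offsets = {0, 2, 5, 9} and gather_map[index] == 1, the lambda returns src_offsets[2] - src_offsets[1] = 3, the length of the gathered list; an invalid map entry instead yields a size of 0 for what becomes a NULL output list.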
12 changes: 8 additions & 4 deletions cpp/include/cudf/lists/detail/scatter.cuh
@@ -39,6 +39,8 @@
#include <thrust/sequence.h>
#include <thrust/transform.h>

+#include <cuda/functional>
+
#include <cinttypes>

namespace cudf {
@@ -62,9 +64,10 @@ rmm::device_uvector<unbound_list_view> list_vector_from_column(
index_begin,
index_end,
vector.begin(),
-    [label, lists_column] __device__(size_type row_index) {
-      return unbound_list_view{label, lists_column, row_index};
-    });
+    cuda::proclaim_return_type<unbound_list_view>(
+      [label, lists_column] __device__(size_type row_index) {
+        return unbound_list_view{label, lists_column, row_index};
+      }));

return vector;
}
@@ -115,7 +118,8 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
lists_column_view(target); // Checks that target is a list column.

auto list_size_begin = thrust::make_transform_iterator(
-    target_vector.begin(), [] __device__(unbound_list_view l) { return l.size(); });
+    target_vector.begin(),
+    cuda::proclaim_return_type<size_type>([] __device__(unbound_list_view l) { return l.size(); }));
auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column(
list_size_begin, list_size_begin + target.size(), stream, mr));

7 changes: 4 additions & 3 deletions cpp/include/cudf/reduction/detail/reduction.cuh
@@ -19,6 +19,7 @@
#include "reduction_operators.cuh"

#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/cast_functor.cuh>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
@@ -64,7 +65,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
auto const initial_value = init.value_or(op.template get_identity<OutputType>());
auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream, mr};

@@ -124,7 +125,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
auto const initial_value = init.value_or(op.template get_identity<OutputType>());
auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream};

@@ -190,7 +191,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
-  auto const binary_op = op.get_binary_op();
+  auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
auto const initial_value = op.template get_identity<IntermediateType>();

rmm::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
10 changes: 6 additions & 4 deletions cpp/include/cudf/reduction/detail/segmented_reduction.cuh
@@ -18,6 +18,8 @@

#include "reduction_operators.cuh"

+#include <cudf/detail/utilities/cast_functor.cuh>
+
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/exec_policy.hpp>
@@ -45,7 +47,7 @@ namespace detail {
* @param d_offset_begin Begin iterator to segment indices
* @param d_offset_end End iterator to segment indices
* @param d_out Output data iterator
- * @param binary_op The reduction operator
+ * @param op The reduction operator
* @param initial_value Initial value of the reduction
* @param stream CUDA stream used for device memory operations and kernel launches
*
@@ -61,12 +63,12 @@ void segmented_reduce(InputIterator d_in,
OffsetIterator d_offset_begin,
OffsetIterator d_offset_end,
OutputIterator d_out,
-  BinaryOp binary_op,
+  BinaryOp op,
OutputType initial_value,
rmm::cuda_stream_view stream)
{
auto const num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;

+  auto const binary_op = cudf::detail::cast_functor<OutputType>(op);
// Allocate temporary storage
size_t temp_storage_bytes = 0;
cub::DeviceSegmentedReduce::Reduce(nullptr,
@@ -148,8 +150,8 @@ void segmented_reduce(InputIterator d_in,
using OutputType = typename thrust::iterator_value<OutputIterator>::type;
using IntermediateType = typename thrust::iterator_value<InputIterator>::type;
auto num_segments = static_cast<size_type>(std::distance(d_offset_begin, d_offset_end)) - 1;
-  auto const binary_op = op.get_binary_op();
auto const initial_value = op.template get_identity<IntermediateType>();
+  auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());

rmm::device_uvector<IntermediateType> intermediate_result{static_cast<std::size_t>(num_segments),
stream};
11 changes: 7 additions & 4 deletions cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -29,6 +29,8 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/optional.h>

+#include <cuda/functional>
+
namespace cudf {
namespace strings {
namespace detail {
@@ -78,10 +80,11 @@ std::unique_ptr<cudf::column> copy_if_else(StringIterLeft lhs_begin,
auto null_mask = (null_count > 0) ? std::move(valid_mask.first) : rmm::device_buffer{};

// build offsets column
-  auto offsets_transformer = [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
-    auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
-    return result.has_value() ? result->size_bytes() : 0;
-  };
+  auto offsets_transformer = cuda::proclaim_return_type<size_type>(
+    [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
+      auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
+      return result.has_value() ? result->size_bytes() : 0;
+    });

auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);