Skip to content

Commit

Permalink
Merge pull request rapidsai#15869 from galipremsagar/branch-24.08-mer…
Browse files Browse the repository at this point in the history
…ge-24.06

Manual merge of Branch 24.08 from 24.06
  • Loading branch information
AyodeAwe authored May 28, 2024
2 parents 8458306 + 4fdbc6e commit 27220d6
Show file tree
Hide file tree
Showing 14 changed files with 209 additions and 26 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ repos:
- id: trailing-whitespace
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- id: end-of-file-fixer
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- repo: https://github.com/PyCQA/isort
Expand Down
20 changes: 20 additions & 0 deletions cpp/cmake/thirdparty/patches/cccl_override.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file": "cccl/kernel_pointer_hiding.diff",
"issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
Expand All @@ -28,15 +33,30 @@
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
}
]
}
Expand Down
47 changes: 47 additions & 0 deletions cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
index 046eb83c0..8047c9701 100644
--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -53,41 +53,15 @@ namespace cuda_cub

namespace __copy
{
-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
-{
- typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
- const auto n = thrust::distance(first, last);
- if (n > 0)
- {
- cudaError status;
- status = trivial_copy_device_to_device(
- policy,
- reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
- reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
- n);
- cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
- }
-
- return result + n;
-}

template <class Derived, class InputIt, class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
+ execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
{
typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
}

-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION
-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
-{
- return device_to_device(
- policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
-}
} // namespace __copy

} // namespace cuda_cub
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
index 2a3cc4e33..8fb337b26 100644
--- a/thrust/thrust/system/cuda/detail/dispatch.h
+++ b/thrust/thrust/system/cuda/detail/dispatch.h
@@ -44,8 +44,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}

/**
@@ -66,9 +65,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}
/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 0606485bb..dbb99ff13 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
};

/// SM60 (GP100)
- struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
enum
{
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index f39613adb..75bd16ff9 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -488,7 +488,7 @@ struct DeviceReducePolicy
};

/// SM60
- struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 419908c4e..6ab0840e1 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -339,7 +339,7 @@ struct DeviceScanPolicy
/// SM600
struct Policy600
: DefaultTuning
- , ChainedPolicy<600, Policy600, Policy520>
+ , ChainedPolicy<600, Policy600, Policy600>
{};

/// SM800
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
index eb76ebb0b..c6c529a50 100644
--- a/cub/cub/block/block_merge_sort.cuh
+++ b/cub/cub/block/block_merge_sort.cuh
@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];

-#pragma unroll
+#pragma unroll 1
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
@@ -376,7 +376,7 @@ public:
//
KeyT max_key = oob_default;

-#pragma unroll
+#pragma unroll 1
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
index 7d9e8622f..da5627306 100644
--- a/cub/cub/thread/thread_sort.cuh
+++ b/cub/cub/thread/thread_sort.cuh
@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
{
constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

-#pragma unroll
+#pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
-#pragma unroll
+#pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))
9 changes: 5 additions & 4 deletions cpp/src/io/comp/statistics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <rmm/exec_policy.hpp>

#include <cuda/functional>
#include <thrust/transform_reduce.h>

namespace cudf::io {
Expand All @@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
results.begin(),
results.end(),
[] __device__(auto& res) {
cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
},
}),
0ul,
thrust::plus<size_t>());

Expand All @@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
zipped_begin,
zipped_end,
[status] __device__(auto tup) {
cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
},
}),
0ul,
thrust::plus<size_t>());
};
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/orc/reader_impl_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
d_sizes = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
// Since the number of rows may not divisible by segment_length,
// the last segment may be shorter than the others.
auto const current_length =
cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
auto const current_length = min(segment_length, num_rows - segment_length * segment_idx);
auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes
return cumulative_size{static_cast<std::size_t>(current_length),
static_cast<std::size_t>(size)};
Expand Down
22 changes: 14 additions & 8 deletions cpp/src/io/orc/stripe_init.cu
Original file line number Diff line number Diff line change
Expand Up @@ -561,20 +561,26 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
uint32_t log2maxcr,
rmm::cuda_stream_view stream)
{
dim3 dim_block(128, 1);
dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block
gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
strm_info, num_streams, compression_block_size, log2maxcr);
auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block
if (num_blocks > 0) {
dim3 dim_block(128, 1);
dim3 dim_grid(num_blocks, 1);
gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
strm_info, num_streams, compression_block_size, log2maxcr);
}
}

void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
int32_t num_streams,
rmm::cuda_stream_view stream)
{
dim3 dim_block(128, 1);
dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block
gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
num_streams);
auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block
if (num_blocks > 0) {
dim3 dim_block(128, 1);
dim3 dim_grid(num_blocks, 1);
gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
num_streams);
}
}

void __host__ ParseRowGroupIndex(RowGroup* row_groups,
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/io/parquet/page_string_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
cudf::detail::join_streams(streams, stream);

// check for needed temp space for DELTA_BYTE_ARRAY
auto const need_sizes = thrust::any_of(
rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
return page.temp_string_size != 0;
});
auto const need_sizes =
thrust::any_of(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
cuda::proclaim_return_type<bool>(
[] __device__(auto& page) { return page.temp_string_size != 0; }));

if (need_sizes) {
// sum up all of the temp_string_sizes
auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
auto const page_sizes = cuda::proclaim_return_type<int64_t>(
[] __device__(PageInfo const& page) { return page.temp_string_size; });
auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
[[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
rmm::cuda_stream_view stream)
{
auto const to_mask = [] __device__(auto const& page) {
auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
};
});
uint32_t const unsupported = thrust::transform_reduce(
rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
return encoding_bitmask_to_str(unsupported);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/utilities/data_casting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
Expand Down Expand Up @@ -783,7 +784,8 @@ template <typename SymbolT>
struct to_string_view_pair {
SymbolT const* data;
to_string_view_pair(SymbolT const* _data) : data(_data) {}
__device__ auto operator()(thrust::tuple<size_type, size_type> ip)
__device__ thrust::pair<char const*, std::size_t> operator()(
thrust::tuple<size_type, size_type> ip)
{
return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
static_cast<std::size_t>(thrust::get<1>(ip))};
Expand All @@ -805,7 +807,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
rmm::exec_policy(stream),
str_tuples,
str_tuples + col_size,
[] __device__(auto t) { return t.second; },
cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
size_type{0},
thrust::maximum<size_type>{});

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/distinct_hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});

auto const [probe_indices_end, _] = this->_hash_table.retrieve(
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});

auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
build_indices->resize(actual_size, stream);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[d_offsets] __device__(auto const idx) -> size_type {
cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
},
}),
0,
thrust::maximum<size_type>{});

Expand Down

0 comments on commit 27220d6

Please sign in to comment.