Improve performance of hash_character_ngrams using warp-per-string kernel #16212
Changes from 24 commits
@@ -36,10 +36,12 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform_scan.h>
 
 #include <stdexcept>
@@ -165,6 +167,47 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 namespace detail {
 namespace {
 
+constexpr cudf::thread_index_type block_size = 256;
+constexpr cudf::thread_index_type bytes_per_thread = 4;
+
+/**
+ * @brief Counts the number of ngrams in each row of the given strings column
+ *
+ * Each warp processes a single string.
+ * The formula is `count = max(0, str.length() - ngrams + 1)`.
+ * If a string has fewer than `ngrams` characters, its count is 0.
+ */
+CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings,
+                                          cudf::size_type ngrams,
+                                          cudf::size_type* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
+
+  namespace cg = cooperative_groups;
+  auto warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
Review comment: Minor:
Suggested change
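For readers new to the cooperative-groups calls used here: `cg::tiled_partition<32>` splits the thread block into warp-sized groups, and `cg::reduce` combines a per-lane value across the group. A standalone toy kernel exercising the same two calls (illustrative only, not from the PR):

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

// Toy kernel: each warp sums its lane ranks; every warp's total is 0+1+...+31 = 496.
__global__ void warp_sum_kernel(int* out)
{
  // split the block into warp-sized (32-thread) groups
  auto warp = cg::tiled_partition<32>(cg::this_thread_block());
  // cg::reduce combines one value per lane across the whole group
  int const total = cg::reduce(warp, static_cast<int>(warp.thread_rank()), cg::plus<int>());
  if (warp.thread_rank() == 0) {
    auto const warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    out[warp_id] = total;
  }
}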
+
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  auto const end = d_str.data() + d_str.size_bytes();
+
+  auto const lane_idx = warp.thread_rank();
+  cudf::size_type count = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = cg::reduce(warp, count, cg::plus<int>());
+  if (lane_idx == 0) { d_counts[str_idx] = std::max(0, char_count - ngrams + 1); }
Review comment:
Suggested change
non-blocking nit: use CUDA native functions instead of STL ones in device code, since the former are hardware accelerated.
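A possible concrete form of that nit (my assumption; the suggestion body is not shown above) is to lean on CUDA's built-in integer max() overload, which is available in device code without any STL involvement:

// hypothetical rewrite of the kernel's final line using CUDA's device max()
if (lane_idx == 0) { d_counts[str_idx] = max(0, char_count - ngrams + 1); }

libcu++'s cuda::std equivalents are another common substitution for host-STL calls in device code.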
+}
+
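A quick host-side sanity check of the counting formula (illustrative only, not part of the change):

#include <algorithm>
#include <cassert>

// Mirrors the kernel's last line: count = max(0, character_count - ngrams + 1)
int ngram_count(int char_count, int ngrams) { return std::max(0, char_count - ngrams + 1); }

int main()
{
  assert(ngram_count(5, 3) == 3);  // "hello" -> "hel", "ell", "llo"
  assert(ngram_count(2, 3) == 0);  // fewer characters than ngrams -> no ngrams
  return 0;
}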
 /**
  * @brief Generate character ngrams for each string
  *
@@ -220,17 +263,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
Review comment: neat!
+    auto counts = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    auto const num_blocks = cudf::util::div_rounding_up_safe(
+      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+    count_char_ngrams_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data<cudf::size_type>();
 
   CUDF_EXPECTS(total_ngrams > 0,
                "Insufficient number of characters in each string to generate ngrams");
 
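The pattern the reviewer is applauding above is an immediately-invoked lambda: the temporary counts vector stays scoped inside the lambda and is freed as soon as the offsets are built, while the structured binding is still initialized in a single expression. A minimal standalone illustration (hypothetical values):

#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // immediately-invoked lambda: temporaries live only inside the lambda body
  auto [sum, count] = [&] {
    std::vector<int> scratch{1, 2, 3};  // destroyed when the lambda returns
    int total = 0;
    for (int v : scratch) { total += v; }
    return std::pair{total, scratch.size()};
  }();
  std::printf("sum=%d count=%zu\n", sum, count);  // prints: sum=6 count=3
  return 0;
}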
@@ -246,36 +288,64 @@
 }
 
 namespace {
 
 /**
  * @brief Computes the hash of each character ngram
  *
- * Each thread processes a single string. Substrings are resolved for every character
+ * Each warp processes a single string. Substrings are resolved for every character
  * of the string and hashed.
  */
-struct character_ngram_hash_fn {
-  cudf::column_device_view const d_strings;
-  cudf::size_type ngrams;
-  cudf::size_type const* d_ngram_offsets;
-  cudf::hash_value_type* d_results;
-
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    if (d_strings.is_null(idx)) return;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return;
-    auto itr = d_str.begin();
-    auto const ngram_offset = d_ngram_offsets[idx];
-    auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
-    auto d_hashes = d_results + ngram_offset;
-    for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
-      auto const begin = itr.byte_offset();
-      auto const end = (itr + ngrams).byte_offset();
-      auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
-      *d_hashes++ = hasher(ngram);
-    }
-  }
-};
+CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
+                                             cudf::size_type ngrams,
+                                             cudf::size_type const* d_ngram_offsets,
+                                             cudf::hash_value_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ cudf::hash_value_type hvs[block_size];  // temp store for hash values
+
+  auto const ngram_offset = d_ngram_offsets[str_idx];
+  auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+
+  auto const end = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  auto d_hashes = d_results + ngram_offset;
+  auto itr = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    cudf::hash_value_type hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve ngram substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(sub_str, ngrams);
+      if (left == 0) { hash = hasher(cudf::string_view(itr, bytes)); }
+    }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values into d_hashes
+      auto const hashes = &hvs[threadIdx.x];
+      d_hashes = thrust::copy_if(
+        thrust::seq, hashes, hashes + cudf::detail::warp_size, d_hashes, [](auto h) {
+          return h != 0;
+        });
+    }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
+  }
+}
 
 } // namespace
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
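A detail worth calling out in the kernel above: a hash value of 0 doubles as the "no ngram starts at this byte" marker, and on each iteration lane 0 serially compacts the warp's 32 candidates so the output order matches byte order within the string. A stripped-down sketch of just that compaction step (assumed names; the PR itself uses thrust::copy_if with thrust::seq):

// Per-warp compaction: every lane publishes a candidate, lane 0 keeps the valid ones.
__device__ unsigned int* compact_warp(unsigned int candidate,  // 0 means "invalid"
                                      unsigned int lane_idx,
                                      unsigned int* out)
{
  __shared__ unsigned int slots[256];  // one slot per thread in a 256-thread block
  slots[threadIdx.x] = candidate;      // publish this lane's candidate
  __syncwarp();
  if (lane_idx == 0) {
    auto const begin = &slots[threadIdx.x];  // lane 0's slot starts the warp's 32 slots
    for (int i = 0; i < 32; ++i) {
      if (begin[i] != 0) { *out++ = begin[i]; }  // keep valid values in lane (byte) order
    }
  }
  __syncwarp();
  return out;  // advanced only on lane 0; other lanes ignore the result
}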
@@ -291,18 +361,16 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const grid = cudf::detail::grid_1d(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
 
   // build offsets column by computing the number of ngrams per string
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
+    auto counts = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    count_char_ngrams_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data<cudf::size_type>();
 
   CUDF_EXPECTS(total_ngrams > 0,
 
@@ -313,11 +381,8 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
     cudf::make_numeric_column(output_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();
 
-  character_ngram_hash_fn generator{*d_strings, ngrams, d_offsets, d_hashes};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     generator);
+  character_ngram_hash_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    *d_strings, ngrams, d_offsets, d_hashes);
 
   return make_lists_column(
     input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
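For context, a minimal host-side call of the public API this kernel sits behind (a sketch; the header path and the choice of 5-character ngrams here are my assumptions about the nvtext API, so check the installed headers):

#include <nvtext/generate_ngrams.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Returns a LIST column: one row of MurmurHash3_x86_32 values per input string.
std::unique_ptr<cudf::column> hash_5grams(cudf::strings_column_view const& strings)
{
  return nvtext::hash_character_ngrams(strings, 5);
}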
Review comment: Thanks for accommodating the cg changes @davidwendt. Just to confirm, do the new changes affect the performance results from before in any way?

Reply: No. It looks like the performance is the same with cooperative groups as with the CUB warp-reduce.