Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update strings/text source to use grid_1d for thread/block/stride calculations #17404

Merged
merged 5 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin,
constexpr size_t out_datatype_size = sizeof(uint4);
constexpr size_t in_datatype_size = sizeof(uint);

int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int global_warp_id = global_thread_id / cudf::detail::warp_size;
int warp_lane = global_thread_id % cudf::detail::warp_size;
int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size;
auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
auto const global_warp_id = global_thread_id / cudf::detail::warp_size;
auto const warp_lane = global_thread_id % cudf::detail::warp_size;
auto const nwarps = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size;

auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
uint4* out_chars_aligned = reinterpret_cast<uint4*>(out_chars - alignment_offset);

for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
auto const out_start = out_offsets[istring];
auto const out_end = out_offsets[istring + 1];

Expand Down
6 changes: 4 additions & 2 deletions cpp/src/strings/convert/convert_urls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
auto const nwarps =
static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];

// Loop through strings, and assign each string to a warp.
Expand Down Expand Up @@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
auto const warp_lane = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
auto const nwarps =
static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
char* in_chars_shared = temporary_buffer[local_warp_id];

// Loop through strings, and assign each string to a warp
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
bitmask_type* output_mask,
size_type* out_valid_count)
{
cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
size_type warp_valid_count = 0;
auto output_index = cudf::detail::grid_1d::global_thread_id();
size_type warp_valid_count = 0;

unsigned active_mask;
if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
Expand Down Expand Up @@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
warp_valid_count += __popc(new_word);
}

output_index += blockDim.x * gridDim.x;
output_index += cudf::detail::grid_1d::grid_stride();
if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); }
}

Expand All @@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
size_type const output_size,
char* output_data)
{
cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
auto output_index = cudf::detail::grid_1d::global_thread_id();

while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
Expand All @@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
auto const first_char = input_offsets_data[input_view.offset()];
output_data[output_index] = input_chars_data[offset_index + first_char];

output_index += blockDim.x * gridDim.x;
output_index += cudf::detail::grid_1d::grid_stride();
}
}

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog,
__syncthreads();
auto const s_prog = reprog_device::load(d_prog, shmem);

auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = s_prog.thread_count();
if (thread_idx < stride) {
for (auto idx = thread_idx; idx < size; idx += stride) {
Expand Down Expand Up @@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn,
__syncthreads();
auto const s_prog = reprog_device::load(d_prog, shmem);

auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
auto const stride = s_prog.thread_count();
if (thread_idx < stride) {
for (auto idx = thread_idx; idx < size; idx += stride) {
Expand Down
16 changes: 7 additions & 9 deletions cpp/src/strings/search/find.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
size_type const stop,
size_type* d_results)
{
size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const idx = cudf::detail::grid_1d::global_thread_id();

if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }

auto const str_idx = idx / cudf::detail::warp_size;
auto const str_idx = idx / cudf::detail::warp_size;
if (str_idx >= d_strings.size()) { return; }
auto const lane_idx = idx % cudf::detail::warp_size;

if (d_strings.is_null(str_idx)) { return; }
Expand Down Expand Up @@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
string_view const d_target,
bool* d_results)
{
size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
using warp_reduce = cub::WarpReduce<bool>;
auto const idx = cudf::detail::grid_1d::global_thread_id();
using warp_reduce = cub::WarpReduce<bool>;
__shared__ typename warp_reduce::TempStorage temp_storage;

if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }

auto const str_idx = idx / cudf::detail::warp_size;
auto const str_idx = idx / cudf::detail::warp_size;
if (str_idx >= d_strings.size()) { return; }
auto const lane_idx = idx % cudf::detail::warp_size;
if (d_strings.is_null(str_idx)) { return; }
// get the string for this warp
Expand Down
9 changes: 3 additions & 6 deletions cpp/src/text/minhash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
cudf::size_type width,
hash_value_type* d_hashes)
{
auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
if (idx >= (static_cast<std::size_t>(d_strings.size()) *
static_cast<std::size_t>(cudf::detail::warp_size))) {
return;
}
auto const idx = cudf::detail::grid_1d::global_thread_id();

auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
if (str_idx >= d_strings.size()) { return; }
auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

if (d_strings.is_null(str_idx)) { return; }
Expand Down
5 changes: 2 additions & 3 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings,
constexpr uint32_t init_val = (1 << FILTER_BIT);
uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

cudf::thread_index_type const char_for_thread =
threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
uint32_t num_new_chars = 0;
auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();
uint32_t num_new_chars = 0;

if (char_for_thread < total_bytes) {
auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);
Expand Down
11 changes: 3 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata(
uint32_t* attn_mask,
uint32_t* metadata)
{
cudf::thread_index_type const output_idx =
threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
static_cast<cudf::thread_index_type>(blockDim.x);
if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
static_cast<cudf::thread_index_type>(max_sequence_length))) {
return;
}
auto const output_idx = cudf::detail::grid_1d::global_thread_id();

uint32_t const absolute_row_id = output_idx / max_sequence_length;
uint32_t const absolute_row_id = output_idx / max_sequence_length;
if (absolute_row_id >= nrows_tensor_token_ids) { return; }
uint32_t const tensor_id = row2tensor[absolute_row_id];
uint32_t const row_within_tensor = row2row_within_tensor[absolute_row_id];
uint32_t const offset_token_ids_tensor = offsets[tensor_id];
Expand Down
14 changes: 4 additions & 10 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;
auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
Expand Down Expand Up @@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points,
uint32_t* end_word_indices,
uint32_t num_strings)
{
cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;
auto const idx = cudf::detail::grid_1d::global_thread_id();
// Ensure the starting character of each string is written to the word start array.
if (idx <= num_strings) {
auto const offset = strings_offsets[idx];
Expand Down Expand Up @@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;
auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id();

if (word_to_tokenize >= total_words) return;
if (word_to_tokenize >= total_words) { return; }
// Each thread gets the start code_point offset for each word and resets the token_id memory to
// the default value. In a post processing step, all of these values will be removed.
auto const token_start = word_starts[word_to_tokenize];
Expand Down
9 changes: 3 additions & 6 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
int8_t* d_results)
{
// string per warp
auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
if (idx >= (static_cast<std::size_t>(d_strings.size()) *
static_cast<std::size_t>(cudf::detail::warp_size))) {
return;
}
auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
auto const idx = cudf::detail::grid_1d::global_thread_id();
auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
if (str_idx >= d_strings.size()) { return; }
auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

if (d_strings.is_null(str_idx)) {
Expand Down
Loading