From 29c87a2af9047f4ee9272abe2329fea83bd32f63 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 17 Aug 2023 13:18:39 -0700 Subject: [PATCH 1/4] Use cudf::thread_index_type in replace.cu. --- cpp/src/replace/replace.cu | 78 ++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index a7847bc0e7f..6ad35226417 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -126,41 +126,42 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, cudf::bitmask_type* output_valid, cudf::size_type* __restrict__ output_valid_count) { - cudf::size_type nrows = input.size(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; - uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, i < nrows); + cudf::size_type nrows = input.size(); + cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t active_mask = 0xffff'ffffu; + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; - while (i < nrows) { + while (tid < nrows) { + auto const idx = static_cast(tid); bool input_is_valid = true; - if (input_has_nulls) input_is_valid = input.is_valid_nocheck(i); + if (input_has_nulls) input_is_valid = input.is_valid_nocheck(idx); bool output_is_valid = input_is_valid; if (input_is_valid) { - int result = get_new_string_value(i, input, values_to_replace, replacement); - cudf::string_view output = (result == -1) ? input.element(i) + int result = get_new_string_value(idx, input, values_to_replace, replacement); + cudf::string_view output = (result == -1) ? input.element(idx) : replacement.element(result); - offsets.data()[i] = output.size_bytes(); - indices.data()[i] = result; + offsets.data()[idx] = output.size_bytes(); + indices.data()[idx] = result; if (replacement_has_nulls && result != -1) { output_is_valid = replacement.is_valid_nocheck(result); } } else { - offsets.data()[i] = 0; - indices.data()[i] = -1; + offsets.data()[idx] = 0; + indices.data()[idx] = -1; } uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); if (0 == lane_id) { - output_valid[cudf::word_index(i)] = bitmask; + output_valid[cudf::word_index(idx)] = bitmask; valid_sum += __popc(bitmask); } - i += blockDim.x * gridDim.x; - active_mask = __ballot_sync(active_mask, i < nrows); + tid += blockDim.x * gridDim.x; + active_mask = __ballot_sync(active_mask, tid < nrows); } // Compute total valid count for this block and add it to global count @@ -188,28 +189,32 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::mutable_column_device_view strings, cudf::mutable_column_device_view indices) { - cudf::size_type nrows = input.size(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; + cudf::size_type nrows = input.size(); + cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; - while (i < nrows) { - bool output_is_valid = true; - bool input_is_valid = true; - cudf::size_type idx = indices.element(i); + while (tid < nrows) { + auto const idx = static_cast(tid); + auto const replace_idx = indices.element(idx); + bool output_is_valid = true; + bool input_is_valid = true; if (input_has_nulls) { - input_is_valid = input.is_valid_nocheck(i); + input_is_valid = input.is_valid_nocheck(idx); output_is_valid = input_is_valid; } - if (replacement_has_nulls && idx != -1) { output_is_valid = replacement.is_valid_nocheck(idx); } + if (replacement_has_nulls && replace_idx != -1) { + output_is_valid = replacement.is_valid_nocheck(replace_idx); + } if (output_is_valid) { - cudf::string_view output = (idx == -1) ? input.element(i) - : replacement.element(idx); - std::memcpy(strings.data() + offsets.data()[i], + cudf::string_view output = (replace_idx == -1) + ? input.element(idx) + : replacement.element(replace_idx); + std::memcpy(strings.data() + offsets.data()[idx], output.data(), output.size_bytes()); } - i += blockDim.x * gridDim.x; + tid += blockDim.x * gridDim.x; } } @@ -247,23 +252,24 @@ __global__ void replace_kernel(cudf::column_device_view input, { T* __restrict__ output_data = output.data(); - cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x; + cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, i < nrows); + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; - while (i < nrows) { + while (tid < nrows) { + auto const idx = static_cast(tid); bool output_is_valid{true}; bool input_is_valid{true}; if (input_has_nulls) { - input_is_valid = input.is_valid_nocheck(i); + input_is_valid = input.is_valid_nocheck(idx); output_is_valid = input_is_valid; } if (input_is_valid) - thrust::tie(output_data[i], output_is_valid) = get_new_value( - i, + thrust::tie(output_data[idx], output_is_valid) = get_new_value( + idx, input.data(), values_to_replace.data(), values_to_replace.data() + values_to_replace.size(), @@ -274,13 +280,13 @@ __global__ void replace_kernel(cudf::column_device_view input, if (input_has_nulls or replacement_has_nulls) { uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); if (0 == lane_id) { - output.set_mask_word(cudf::word_index(i), bitmask); + output.set_mask_word(cudf::word_index(idx), bitmask); valid_sum += __popc(bitmask); } } - i += blockDim.x * gridDim.x; - active_mask = __ballot_sync(active_mask, i < nrows); + tid += blockDim.x * gridDim.x; + active_mask = __ballot_sync(active_mask, tid < nrows); } if (input_has_nulls or replacement_has_nulls) { // Compute total valid count for this block and add it to global count From 55c535c76d9e93d74d4ff39a241c511c5225f2e2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 17 Aug 2023 14:22:46 -0700 Subject: [PATCH 2/4] Use size_type. --- cpp/src/replace/replace.cu | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 6ad35226417..1313316bd85 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -390,10 +390,16 @@ std::unique_ptr replace_kernel_forwarder::operator() sizes = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input_col.size(), cudf::mask_state::UNALLOCATED, stream); - std::unique_ptr indices = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input_col.size(), cudf::mask_state::UNALLOCATED, stream); + std::unique_ptr sizes = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + input_col.size(), + cudf::mask_state::UNALLOCATED, + stream); + std::unique_ptr indices = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + input_col.size(), + cudf::mask_state::UNALLOCATED, + stream); auto sizes_view = sizes->mutable_view(); auto indices_view = indices->mutable_view(); @@ -419,7 +425,7 @@ std::unique_ptr replace_kernel_forwarder::operator()(), sizes_view.end(), stream, mr); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view, stream); From 439cd899f1bf73f9a437c724dad52443910403e8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 24 Aug 2023 16:39:38 -0500 Subject: [PATCH 3/4] Cast to thread_index_type in more places. --- cpp/src/replace/replace.cu | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 1313316bd85..84e26486030 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -126,10 +126,11 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, cudf::bitmask_type* output_valid, cudf::size_type* __restrict__ output_valid_count) { - cudf::size_type nrows = input.size(); - cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; - uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, tid < nrows); + cudf::size_type nrows = input.size(); + auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + + cudf::thread_index_type{threadIdx.x}; + uint32_t active_mask = 0xffff'ffffu; + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; @@ -160,7 +161,7 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, valid_sum += __popc(bitmask); } - tid += blockDim.x * gridDim.x; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; active_mask = __ballot_sync(active_mask, tid < nrows); } @@ -189,8 +190,9 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::mutable_column_device_view strings, cudf::mutable_column_device_view indices) { - cudf::size_type nrows = input.size(); - cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; + cudf::size_type nrows = input.size(); + auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + + cudf::thread_index_type{threadIdx.x}; while (tid < nrows) { auto const idx = static_cast(tid); @@ -214,7 +216,7 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, output.size_bytes()); } - tid += blockDim.x * gridDim.x; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; } } @@ -252,7 +254,8 @@ __global__ void replace_kernel(cudf::column_device_view input, { T* __restrict__ output_data = output.data(); - cudf::thread_index_type tid = blockIdx.x * blockDim.x + threadIdx.x; + auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + + cudf::thread_index_type{threadIdx.x}; uint32_t active_mask = 0xffff'ffffu; active_mask = __ballot_sync(active_mask, tid < nrows); @@ -285,7 +288,7 @@ __global__ void replace_kernel(cudf::column_device_view input, } } - tid += blockDim.x * gridDim.x; + tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; active_mask = __ballot_sync(active_mask, tid < nrows); } if (input_has_nulls or replacement_has_nulls) { From 464d29cf341d4650dfa126a0f9712d0fd0f74aea Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 1 Sep 2023 16:33:18 -0500 Subject: [PATCH 4/4] Use grid_1d utilities. --- cpp/src/replace/replace.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 84e26486030..fe36cc4cf8a 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -127,10 +127,10 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, cudf::size_type* __restrict__ output_valid_count) { cudf::size_type nrows = input.size(); - auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + - cudf::thread_index_type{threadIdx.x}; - uint32_t active_mask = 0xffff'ffffu; - active_mask = __ballot_sync(active_mask, tid < nrows); + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + uint32_t active_mask = 0xffff'ffffu; + active_mask = __ballot_sync(active_mask, tid < nrows); auto const lane_id{threadIdx.x % cudf::detail::warp_size}; uint32_t valid_sum{0}; @@ -161,7 +161,7 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, valid_sum += __popc(bitmask); } - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; active_mask = __ballot_sync(active_mask, tid < nrows); } @@ -191,8 +191,8 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); - auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + - cudf::thread_index_type{threadIdx.x}; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); while (tid < nrows) { auto const idx = static_cast(tid); @@ -216,7 +216,7 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, output.size_bytes()); } - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; } } @@ -254,8 +254,8 @@ __global__ void replace_kernel(cudf::column_device_view input, { T* __restrict__ output_data = output.data(); - auto tid = cudf::thread_index_type{blockIdx.x} * cudf::thread_index_type{blockDim.x} + - cudf::thread_index_type{threadIdx.x}; + auto tid = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); uint32_t active_mask = 0xffff'ffffu; active_mask = __ballot_sync(active_mask, tid < nrows); @@ -288,7 +288,7 @@ __global__ void replace_kernel(cudf::column_device_view input, } } - tid += cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + tid += stride; active_mask = __ballot_sync(active_mask, tid < nrows); } if (input_has_nulls or replacement_has_nulls) {