From 4a4fc685137a3f0f199be6ae2b9497d3093b768e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 21 Mar 2024 10:14:17 -0400 Subject: [PATCH 01/10] Add experimental make-strings-children utility --- .../strings/detail/strings_children_ex.cuh | 122 ++++++++++++++++++ cpp/src/strings/combine/concatenate.cu | 17 ++- cpp/src/strings/repeat_strings.cu | 28 ++-- 3 files changed, 146 insertions(+), 21 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/strings_children_ex.cuh diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh new file mode 100644 index 00000000000..f4f728e15fc --- /dev/null +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace experimental { + +/** + * @brief Creates child offsets and chars data by applying the template function that + * can be used for computing the output size of each string as well as create the output + * + * @tparam SizeAndExecuteFunction Function must accept an index. + * It must also have members d_sizes, d_offsets and d_chars which are set to + * memory containing the offsets and chars columns during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by d_chars. + * @param exec_size Number of rows for executing the `size_and_exec_fn` function. + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars data for a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto output_sizes = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_sizes = output_sizes.data(); + + // This is called twice -- once for offsets and once for chars. + // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + exec_size, + size_and_exec_fn); + }; + + // Compute the output sizes + for_each_fn(size_and_exec_fn); + + // Convert the sizes to offsets + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + size_and_exec_fn.d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + + // Now build the chars column + rmm::device_uvector chars(bytes, stream, mr); + + // Execute the function fn again to fill the chars column. + // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. + if (bytes > 0) { + size_and_exec_fn.d_chars = chars.data(); + for_each_fn(size_and_exec_fn); + } + + return std::pair(std::move(offsets_column), std::move(chars)); +} + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output. + * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must also have members d_offsets and d_chars which are set to + * memory containing the offsets and chars columns during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by d_chars. + * @param strings_count Number of strings. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned columns' device memory. + * @return offsets child column and chars child column for a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); +} + +} // namespace experimental +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 14f530971f5..bc5ea693bb8 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -17,11 +17,12 @@ #include #include #include +#include #include #include #include #include -#include +#include #include #include #include @@ -48,8 +49,9 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; - size_type* d_offsets{}; + size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets{}; /** * @brief Concatenate each table row to a single output string. @@ -67,7 +69,7 @@ struct concat_strings_base { thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { return col.is_null(idx); })) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) d_sizes[idx] = 0; return; } @@ -94,7 +96,7 @@ struct concat_strings_base { write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - if (!d_chars) d_offsets[idx] = bytes; + if (!d_chars) d_sizes[idx] = bytes; } }; @@ -142,7 +144,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -187,7 +189,7 @@ struct multi_separator_concat_fn : concat_strings_base { __device__ void operator()(size_type idx) { if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) d_sizes[idx] = 0; return; } @@ -234,7 +236,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars] = + experimental::make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 690a72c098f..f80a21fdc40 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -20,7 +20,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -106,12 +107,11 @@ struct compute_size_and_repeat_fn { column_device_view const strings_dv; size_type const repeat_times; bool const has_nulls; - - size_type* d_offsets{nullptr}; - + size_type* d_sizes{}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. - char* d_chars{nullptr}; + char* d_chars{}; + cudf::detail::input_offsetalator d_offsets{}; // `idx` will be in the range of [0, repeat_times * strings_count). __device__ void operator()(size_type const idx) const noexcept @@ -121,7 +121,7 @@ struct compute_size_and_repeat_fn { auto const is_valid = !has_nulls || strings_dv.is_valid_nocheck(str_idx); if (!d_chars && repeat_idx == 0) { - d_offsets[str_idx] = + d_sizes[str_idx] = is_valid ? repeat_times * strings_dv.element(str_idx).size_bytes() : 0; } @@ -160,8 +160,8 @@ std::unique_ptr repeat_strings(strings_column_view const& input, auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()}; - auto [offsets_column, chars] = - make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children( + fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), chars.release(), @@ -182,11 +182,11 @@ struct compute_sizes_and_repeat_fn { bool const strings_has_nulls; bool const rtimes_has_nulls; - size_type* d_offsets{nullptr}; - + size_type* d_sizes{nullptr}; // If d_chars == nullptr: only compute sizes of the output strings. // If d_chars != nullptr: only repeat strings. - char* d_chars{nullptr}; + char* d_chars{}; + cudf::detail::input_offsetalator d_offsets{}; __device__ void operator()(size_type const idx) const noexcept { @@ -196,7 +196,7 @@ struct compute_sizes_and_repeat_fn { // Any null input (either string or repeat_times value) will result in a null output. auto const is_valid = string_is_valid && rtimes_is_valid; if (!is_valid) { - if (!d_chars) { d_offsets[idx] = 0; } + if (!d_chars) { d_sizes[idx] = 0; } return; } @@ -205,7 +205,7 @@ struct compute_sizes_and_repeat_fn { if (!d_chars) { // repeat_times could be negative - d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; + d_sizes[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; } else { auto output_ptr = d_chars + d_offsets[idx]; while (repeat_times-- > 0) { @@ -240,7 +240,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, input.has_nulls(), repeat_times.has_nulls()}; - auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr); // We generate new bitmask by AND of the two input columns' bitmasks. // Note that if either of the input columns are nullable, the output column will also be nullable From 7dc2f24be0a4ecbd98e56b067c4e5d9d14c54508 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 10 Apr 2024 10:14:37 -0400 Subject: [PATCH 02/10] fix doxygen and other comments --- .../strings/detail/strings_children_ex.cuh | 32 +++++++++---------- cpp/src/strings/combine/concatenate.cu | 6 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index f4f728e15fc..08500420ec6 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -38,17 +38,18 @@ namespace experimental { * @brief Creates child offsets and chars data by applying the template function that * can be used for computing the output size of each string as well as create the output * - * @tparam SizeAndExecuteFunction Function must accept an index. - * It must also have members d_sizes, d_offsets and d_chars which are set to - * memory containing the offsets and chars columns during write. + * @tparam SizeAndExecuteFunction Function must accept a row index. + * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass + * and members `d_offsets` and `d_chars` for the 2nd pass to resolve the output memory + * location for each row. * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. - * @param exec_size Number of rows for executing the `size_and_exec_fn` function. + * @param exec_size Number of rows for executing the `size_and_exec_fn` function * @param strings_count Number of strings * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory - * @return Offsets child column and chars data for a strings column + * @return Offsets child column and chars vector for creating a strings column */ template auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, @@ -60,7 +61,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, auto output_sizes = rmm::device_uvector(strings_count, stream); size_and_exec_fn.d_sizes = output_sizes.data(); - // This is called twice -- once for offsets and once for chars. + // This is called twice -- once for computing sizes and once for writing chars. // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { thrust::for_each_n(rmm::exec_policy(stream), @@ -81,9 +82,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. + // Execute the function fn again to fill in the chars data. if (bytes > 0) { size_and_exec_fn.d_chars = chars.data(); for_each_fn(size_and_exec_fn); @@ -96,16 +95,17 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, * @brief Creates child offsets and chars columns by applying the template function that * can be used for computing the output size of each string as well as create the output. * - * @tparam SizeAndExecuteFunction Function must accept an index and return a size. - * It must also have members d_offsets and d_chars which are set to - * memory containing the offsets and chars columns during write. + * @tparam SizeAndExecuteFunction Function must accept a row index. + * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass + * and members `d_offsets` and `d_chars` for the 2nd pass to resolve the output memory + * location for each row. * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. - * @param strings_count Number of strings. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned columns' device memory. - * @return offsets child column and chars child column for a strings column + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars vector for creating a strings column */ template auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index bc5ea693bb8..c9900fb0db4 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -69,7 +69,7 @@ struct concat_strings_base { thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { return col.is_null(idx); })) { - if (!d_chars) d_sizes[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } @@ -96,7 +96,7 @@ struct concat_strings_base { write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - if (!d_chars) d_sizes[idx] = bytes; + if (!d_chars) { d_sizes[idx] = bytes; } } }; @@ -189,7 +189,7 @@ struct multi_separator_concat_fn : concat_strings_base { __device__ void operator()(size_type idx) { if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { - if (!d_chars) d_sizes[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } From 55eed6865c2a8872e3e59cbbe243ea1a467f2a05 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 10 Apr 2024 11:43:25 -0400 Subject: [PATCH 03/10] change rows to threads in doxygen --- cpp/include/cudf/strings/detail/strings_children_ex.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index 08500420ec6..5c85295bbeb 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -45,7 +45,7 @@ namespace experimental { * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. - * @param exec_size Number of rows for executing the `size_and_exec_fn` function + * @param exec_size Number of threads for executing the `size_and_exec_fn` function * @param strings_count Number of strings * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory From d3d297da1e0880ee35116b43620ff0f0a37c7092 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 12 Apr 2024 10:03:19 -0400 Subject: [PATCH 04/10] add example to exp doxygen --- .../strings/detail/strings_children_ex.cuh | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index 5c85295bbeb..fa3ae965dad 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -38,6 +38,24 @@ namespace experimental { * @brief Creates child offsets and chars data by applying the template function that * can be used for computing the output size of each string as well as create the output * + * The `size_and_exec_fn` is expected to be functor with 3 settable member variables. + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars{}; + * cudf::detail::input_offsetalator d_offsets; + * __device__ void operator()(size_type thread_idx) { + * // functor-specific logic to resolve out_idx from thread_idx + * if( !d_chars ) { + * d_sizes[out_idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[out_idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode + * * @tparam SizeAndExecuteFunction Function must accept a row index. * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass * and members `d_offsets` and `d_chars` for the 2nd pass to resolve the output memory @@ -93,7 +111,24 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, /** * @brief Creates child offsets and chars columns by applying the template function that - * can be used for computing the output size of each string as well as create the output. + * can be used for computing the output size of each string as well as create the output + * + * The `size_and_exec_fn` is expected to be functor with 3 settable member variables. + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars{}; + * cudf::detail::input_offsetalator d_offsets; + * __device__ void operator()(size_type idx) { + * if( !d_chars ) { + * d_sizes[idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode * * @tparam SizeAndExecuteFunction Function must accept a row index. * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass From b48a24897fc22bb67c0afece6142ed02449718aa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 17 Apr 2024 10:27:37 -0400 Subject: [PATCH 05/10] fix mr parameter type for new exp utility --- cpp/include/cudf/strings/detail/strings_children_ex.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index fa3ae965dad..d77981999a7 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -74,7 +74,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, size_type exec_size, size_type strings_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output_sizes = rmm::device_uvector(strings_count, stream); size_and_exec_fn.d_sizes = output_sizes.data(); @@ -146,7 +146,7 @@ template auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, size_type strings_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } From 85745ccf202fbdb2eeed559f07a88494cd62497c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 18 Apr 2024 08:37:03 -0400 Subject: [PATCH 06/10] fix includes for strings_children_ex.cuh --- cpp/include/cudf/strings/detail/strings_children_ex.cuh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index d77981999a7..867356e2e63 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -18,17 +18,14 @@ #include #include #include -#include -#include #include +#include #include #include #include -#include - namespace cudf { namespace strings { namespace detail { From 06069e9930379b196a4639cb34026aa0b2f4f735 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 18 Apr 2024 12:14:22 -0400 Subject: [PATCH 07/10] change ternary to std::max() --- cpp/src/strings/repeat_strings.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index c422e318d5e..e216443ee83 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -206,7 +206,7 @@ struct compute_sizes_and_repeat_fn { if (!d_chars) { // repeat_times could be negative - d_sizes[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; + d_sizes[idx] = std::max(repeat_times, 0) * d_str.size_bytes(); } else { auto output_ptr = d_chars + d_offsets[idx]; while (repeat_times-- > 0) { From 17228b88ffe0fd122fa30f732ca1761698a66cc8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 19 Apr 2024 10:23:00 -0400 Subject: [PATCH 08/10] rework functor calling interface --- .../strings/detail/strings_children_ex.cuh | 112 +++++++++++------- cpp/src/strings/combine/concatenate.cu | 24 ++-- cpp/src/strings/repeat_strings.cu | 40 ++++--- 3 files changed, 109 insertions(+), 67 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index 867356e2e63..8361a9d5146 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -31,32 +32,55 @@ namespace strings { namespace detail { namespace experimental { +/** + * @brief Kernel used by make_strings_children for calling the given functor + * + * @tparam SizeAndExecuteFunction Functor type to call in each thread + * + * @param fn Functor to call in each thread + * @param exec_size Total number of threads to be processed by this kernel + * @param d_sizes Output sizes array passed to the functor + * @param d_chars Output char buffer passed to the functor + * @param d_offsets Offsets to address specific sections of d_chars + */ +template +CUDF_KERNEL __global__ void strings_children_kernel(SizeAndExecuteFunction fn, + size_type exec_size, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) +{ + auto tid = cudf::detail::grid_1d::global_thread_id(); + if (tid < exec_size) { fn(tid, d_sizes, d_chars, d_offsets); } +} + /** * @brief Creates child offsets and chars data by applying the template function that * can be used for computing the output size of each string as well as create the output * - * The `size_and_exec_fn` is expected to be functor with 3 settable member variables. + * The `size_and_exec_fn` is expected declare an operator() function with 4 parameters. * @code{.cpp} * struct size_and_exec_fn { - * size_type* d_sizes; - * char* d_chars{}; - * cudf::detail::input_offsetalator d_offsets; - * __device__ void operator()(size_type thread_idx) { - * // functor-specific logic to resolve out_idx from thread_idx - * if( !d_chars ) { - * d_sizes[out_idx] = output_size; - * } else { - * auto d_output = d_chars + d_offsets[out_idx]; - * // write characters to d_output - * } + * __device__ void operator()(size_type thread_idx, + * size_type* d_sizes, + * char* d_chars, + * input_offsetalator d_offsets) + * { + * // functor-specific logic to resolve out_idx from thread_idx + * if( !d_chars ) { + * d_sizes[out_idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[out_idx]; + * // write characters to d_output + * } * } * }; * @endcode * - * @tparam SizeAndExecuteFunction Function must accept a row index. - * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass - * and members `d_offsets` and `d_chars` for the 2nd pass to resolve the output memory - * location for each row. + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting 4 parameters: + * `size_type` index of the current thread, `size_type*` to hold computed row + * output sizes on the 1st pass and `char*` and `input_offsetalator` + * for the 2nd pass to resolve the output memory location for each row. * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. @@ -73,35 +97,32 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto output_sizes = rmm::device_uvector(strings_count, stream); - size_and_exec_fn.d_sizes = output_sizes.data(); - // This is called twice -- once for computing sizes and once for writing chars. // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. - auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - exec_size, - size_and_exec_fn); + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) { + auto constexpr block_size = 256; + auto grid = cudf::detail::grid_1d{exec_size, block_size}; + strings_children_kernel<<>>( + size_and_exec_fn, exec_size, d_sizes, d_chars, d_offsets); }; // Compute the output sizes - for_each_fn(size_and_exec_fn); + auto output_sizes = rmm::device_uvector(strings_count, stream); + for_each_fn(size_and_exec_fn, output_sizes.data(), nullptr, cudf::detail::input_offsetalator{}); // Convert the sizes to offsets auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - size_and_exec_fn.d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); // Execute the function fn again to fill in the chars data. - if (bytes > 0) { - size_and_exec_fn.d_chars = chars.data(); - for_each_fn(size_and_exec_fn); - } + if (bytes > 0) { for_each_fn(size_and_exec_fn, output_sizes.data(), chars.data(), d_offsets); } return std::pair(std::move(offsets_column), std::move(chars)); } @@ -113,24 +134,25 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, * The `size_and_exec_fn` is expected to be functor with 3 settable member variables. * @code{.cpp} * struct size_and_exec_fn { - * size_type* d_sizes; - * char* d_chars{}; - * cudf::detail::input_offsetalator d_offsets; - * __device__ void operator()(size_type idx) { - * if( !d_chars ) { - * d_sizes[idx] = output_size; - * } else { - * auto d_output = d_chars + d_offsets[idx]; - * // write characters to d_output - * } + * __device__ void operator()(size_type idx, + * size_type* d_sizes, + * char* d_chars, + * input_offsetalator d_offsets) + * { + * if( !d_chars ) { + * d_sizes[idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[idx]; + * // write characters to d_output + * } * } * }; * @endcode * - * @tparam SizeAndExecuteFunction Function must accept a row index. - * It must also have member `d_sizes` to hold computed row output sizes on the 1st pass - * and members `d_offsets` and `d_chars` for the 2nd pass to resolve the output memory - * location for each row. + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting 4 parameters: + * `size_type` index of the current thread, `size_type*` to hold computed row + * output sizes on the 1st pass and `char*` and `input_offsetalator` + * for the 2nd pass to resolve the output memory location for each row. * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 39cf1cf7635..f81e58c55ca 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -50,9 +50,6 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; - size_type* d_sizes{}; - char* d_chars{}; - cudf::detail::input_offsetalator d_offsets{}; /** * @brief Concatenate each table row to a single output string. @@ -64,7 +61,11 @@ struct concat_strings_base { * @param idx The current row to process * @param d_separator String to place in between each column's row */ - __device__ void process_row(size_type idx, string_view const d_separator) + __device__ void process_row(size_type idx, + string_view const d_separator, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) { if (!d_narep.is_valid() && thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { @@ -115,7 +116,13 @@ struct concat_strings_fn : concat_strings_base { { } - __device__ void operator()(size_type idx) { process_row(idx, d_separator); } + __device__ void operator()(std::size_t idx, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) + { + process_row(idx, d_separator, d_sizes, d_chars, d_offsets); + } }; } // namespace @@ -187,7 +194,10 @@ struct multi_separator_concat_fn : concat_strings_base { { } - __device__ void operator()(size_type idx) + __device__ void operator()(size_type idx, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) { if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) { d_sizes[idx] = 0; } @@ -197,7 +207,7 @@ struct multi_separator_concat_fn : concat_strings_base { auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); // base class utility function handles the rest - process_row(idx, d_separator); + process_row(idx, d_separator, d_sizes, d_chars, d_offsets); } }; diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index e216443ee83..b011d8d3ff9 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -108,14 +108,19 @@ struct compute_size_and_repeat_fn { column_device_view const strings_dv; size_type const repeat_times; bool const has_nulls; - size_type* d_sizes{}; - // If d_chars == nullptr: only compute sizes of the output strings. - // If d_chars != nullptr: only repeat strings. - char* d_chars{}; - cudf::detail::input_offsetalator d_offsets{}; - - // `idx` will be in the range of [0, repeat_times * strings_count). - __device__ void operator()(size_type const idx) const noexcept + + /** + * @brief Called by make_strings_children to build output + * + * @param idx Thread index in the range [0,repeat_times * strings_count) + * @param d_sizes Return output size here in 1st call (d_chars==nullptr) + * @param d_chars Write output here in 2nd call + * @param d_offsets Offsets to address output row within d_chars + */ + __device__ void operator()(size_type idx, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) const noexcept { auto const str_idx = idx / repeat_times; // value cycles in [0, string_count) auto const repeat_idx = idx % repeat_times; // value cycles in [0, repeat_times) @@ -183,13 +188,18 @@ struct compute_sizes_and_repeat_fn { bool const strings_has_nulls; bool const rtimes_has_nulls; - size_type* d_sizes{nullptr}; - // If d_chars == nullptr: only compute sizes of the output strings. - // If d_chars != nullptr: only repeat strings. - char* d_chars{}; - cudf::detail::input_offsetalator d_offsets{}; - - __device__ void operator()(size_type const idx) const noexcept + /** + * @brief Called by make_strings_children to build output + * + * @param idx Row index + * @param d_sizes Return output size here in 1st call (d_chars==nullptr) + * @param d_chars Write output here in 2nd call + * @param d_offsets Offsets to address output row within d_chars + */ + __device__ void operator()(size_type idx, + size_type* d_sizes, + char* d_chars, + cudf::detail::input_offsetalator d_offsets) const noexcept { auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); From b066917b5227e79a4026556d4ad5cb7b21f06aee Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 22 Apr 2024 13:57:46 -0400 Subject: [PATCH 09/10] undo member to parameter change --- .../strings/detail/strings_children_ex.cuh | 83 +++++++++++-------- cpp/src/strings/combine/concatenate.cu | 24 ++---- cpp/src/strings/repeat_strings.cu | 16 ++-- 3 files changed, 63 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index 8361a9d5146..26787d79374 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -44,27 +44,32 @@ namespace experimental { * @param d_offsets Offsets to address specific sections of d_chars */ template -CUDF_KERNEL __global__ void strings_children_kernel(SizeAndExecuteFunction fn, - size_type exec_size, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) +CUDF_KERNEL __global__ void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size) { auto tid = cudf::detail::grid_1d::global_thread_id(); - if (tid < exec_size) { fn(tid, d_sizes, d_chars, d_offsets); } + if (tid < exec_size) { fn(tid); } } /** * @brief Creates child offsets and chars data by applying the template function that * can be used for computing the output size of each string as well as create the output * - * The `size_and_exec_fn` is expected declare an operator() function with 4 parameters. + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. + * * @code{.cpp} * struct size_and_exec_fn { - * __device__ void operator()(size_type thread_idx, - * size_type* d_sizes, - * char* d_chars, - * input_offsetalator d_offsets) + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type thread_idx) * { * // functor-specific logic to resolve out_idx from thread_idx * if( !d_chars ) { @@ -77,10 +82,9 @@ CUDF_KERNEL __global__ void strings_children_kernel(SizeAndExecuteFunction fn, * }; * @endcode * - * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting 4 parameters: - * `size_type` index of the current thread, `size_type*` to hold computed row - * output sizes on the 1st pass and `char*` and `input_offsetalator` - * for the 2nd pass to resolve the output memory location for each row. + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. * * @param size_and_exec_fn This is called twice. Once for the output size of each string * and once again to fill in the memory pointed to by d_chars. @@ -99,30 +103,31 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, { // This is called twice -- once for computing sizes and once for writing chars. // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. - auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) { + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { auto constexpr block_size = 256; auto grid = cudf::detail::grid_1d{exec_size, block_size}; - strings_children_kernel<<>>( - size_and_exec_fn, exec_size, d_sizes, d_chars, d_offsets); + strings_children_kernel<<>>(size_and_exec_fn, + exec_size); }; // Compute the output sizes - auto output_sizes = rmm::device_uvector(strings_count, stream); - for_each_fn(size_and_exec_fn, output_sizes.data(), nullptr, cudf::detail::input_offsetalator{}); + auto output_sizes = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_sizes = output_sizes.data(); + size_and_exec_fn.d_chars = nullptr; + for_each_fn(size_and_exec_fn); // Convert the sizes to offsets auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + size_and_exec_fn.d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); + size_and_exec_fn.d_chars = chars.data(); // Execute the function fn again to fill in the chars data. - if (bytes > 0) { for_each_fn(size_and_exec_fn, output_sizes.data(), chars.data(), d_offsets); } + if (bytes > 0) { for_each_fn(size_and_exec_fn); } return std::pair(std::move(offsets_column), std::move(chars)); } @@ -131,13 +136,22 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, * @brief Creates child offsets and chars columns by applying the template function that * can be used for computing the output size of each string as well as create the output * - * The `size_and_exec_fn` is expected to be functor with 3 settable member variables. + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. + * * @code{.cpp} * struct size_and_exec_fn { - * __device__ void operator()(size_type idx, - * size_type* d_sizes, - * char* d_chars, - * input_offsetalator d_offsets) + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type idx) * { * if( !d_chars ) { * d_sizes[idx] = output_size; @@ -149,13 +163,12 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, * }; * @endcode * - * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting 4 parameters: - * `size_type` index of the current thread, `size_type*` to hold computed row - * output sizes on the 1st pass and `char*` and `input_offsetalator` - * for the 2nd pass to resolve the output memory location for each row. + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. * * @param size_and_exec_fn This is called twice. Once for the output size of each string - * and once again to fill in the memory pointed to by d_chars. + * and once again to fill in the memory pointed to by `d_chars`. * @param strings_count Number of strings * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index f81e58c55ca..97008fa94f8 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -50,6 +50,9 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Concatenate each table row to a single output string. @@ -61,11 +64,7 @@ struct concat_strings_base { * @param idx The current row to process * @param d_separator String to place in between each column's row */ - __device__ void process_row(size_type idx, - string_view const d_separator, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) + __device__ void process_row(size_type idx, string_view const d_separator) { if (!d_narep.is_valid() && thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { @@ -116,13 +115,7 @@ struct concat_strings_fn : concat_strings_base { { } - __device__ void operator()(std::size_t idx, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) - { - process_row(idx, d_separator, d_sizes, d_chars, d_offsets); - } + __device__ void operator()(std::size_t idx) { process_row(idx, d_separator); } }; } // namespace @@ -194,10 +187,7 @@ struct multi_separator_concat_fn : concat_strings_base { { } - __device__ void operator()(size_type idx, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) + __device__ void operator()(size_type idx) { if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) { d_sizes[idx] = 0; } @@ -207,7 +197,7 @@ struct multi_separator_concat_fn : concat_strings_base { auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); // base class utility function handles the rest - process_row(idx, d_separator, d_sizes, d_chars, d_offsets); + process_row(idx, d_separator); } }; diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index b011d8d3ff9..de1d5e38e00 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -108,6 +108,9 @@ struct compute_size_and_repeat_fn { column_device_view const strings_dv; size_type const repeat_times; bool const has_nulls; + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Called by make_strings_children to build output @@ -117,10 +120,7 @@ struct compute_size_and_repeat_fn { * @param d_chars Write output here in 2nd call * @param d_offsets Offsets to address output row within d_chars */ - __device__ void operator()(size_type idx, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) const noexcept + __device__ void operator()(size_type idx) const noexcept { auto const str_idx = idx / repeat_times; // value cycles in [0, string_count) auto const repeat_idx = idx % repeat_times; // value cycles in [0, repeat_times) @@ -187,6 +187,9 @@ struct compute_sizes_and_repeat_fn { Iterator const repeat_times_iter; bool const strings_has_nulls; bool const rtimes_has_nulls; + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Called by make_strings_children to build output @@ -196,10 +199,7 @@ struct compute_sizes_and_repeat_fn { * @param d_chars Write output here in 2nd call * @param d_offsets Offsets to address output row within d_chars */ - __device__ void operator()(size_type idx, - size_type* d_sizes, - char* d_chars, - cudf::detail::input_offsetalator d_offsets) const noexcept + __device__ void operator()(size_type idx) const noexcept { auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); From a285ebddd65f08a20beaed5f2df097a83ec25f22 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 22 Apr 2024 18:54:31 -0400 Subject: [PATCH 10/10] fix kernel decl and doxygen --- cpp/include/cudf/strings/detail/strings_children_ex.cuh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh index 26787d79374..6028c7e2437 100644 --- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -39,12 +39,9 @@ namespace experimental { * * @param fn Functor to call in each thread * @param exec_size Total number of threads to be processed by this kernel - * @param d_sizes Output sizes array passed to the functor - * @param d_chars Output char buffer passed to the functor - * @param d_offsets Offsets to address specific sections of d_chars */ template -CUDF_KERNEL __global__ void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size) +CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size) { auto tid = cudf::detail::grid_1d::global_thread_id(); if (tid < exec_size) { fn(tid); }