From 702706d7c2e86e900ffbca0568d6ff9d2e415975 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:37:49 -0400 Subject: [PATCH] Add experimental make_strings_children utility (#15363) Adds new `cudf::strings::detail::experimental::make_strings_children` which uses the offsetalator to build output columns. The current `d_offsets` member required by the given functors no longer stores sizes and offsets but is now split into `d_sizes` and `d_offsets` where `d_sizes` is computed in the first pass and then `d_offsets` is set to an offsetalator for building output in `d_chars`. Once all the uses of `make_strings_children` (~50 or so) are converted to use the experimental implementation, this will replace the old implementation and the 'experimental' namespace will be removed. This PR includes 2 changes, `repeat_strings` and `concatenate` (per row) since each use different overloaded `make_strings_children` functions to verify the code does not break any current tests. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15363 --- .../strings/detail/strings_children_ex.cuh | 186 ++++++++++++++++++ cpp/src/strings/combine/concatenate.cu | 21 +- cpp/src/strings/repeat_strings.cu | 58 +++--- 3 files changed, 232 insertions(+), 33 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/strings_children_ex.cuh diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh new file mode 100644 index 00000000000..6028c7e2437 --- /dev/null +++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace experimental { + +/** + * @brief Kernel used by make_strings_children for calling the given functor + * + * @tparam SizeAndExecuteFunction Functor type to call in each thread + * + * @param fn Functor to call in each thread + * @param exec_size Total number of threads to be processed by this kernel + */ +template +CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size) +{ + auto tid = cudf::detail::grid_1d::global_thread_id(); + if (tid < exec_size) { fn(tid); } +} + +/** + * @brief Creates child offsets and chars data by applying the template function that + * can be used for computing the output size of each string as well as create the output + * + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. + * + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type thread_idx) + * { + * // functor-specific logic to resolve out_idx from thread_idx + * if( !d_chars ) { + * d_sizes[out_idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[out_idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode + * + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by d_chars. + * @param exec_size Number of threads for executing the `size_and_exec_fn` function + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars vector for creating a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // This is called twice -- once for computing sizes and once for writing chars. + // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + auto constexpr block_size = 256; + auto grid = cudf::detail::grid_1d{exec_size, block_size}; + strings_children_kernel<<>>(size_and_exec_fn, + exec_size); + }; + + // Compute the output sizes + auto output_sizes = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_sizes = output_sizes.data(); + size_and_exec_fn.d_chars = nullptr; + for_each_fn(size_and_exec_fn); + + // Convert the sizes to offsets + auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + size_and_exec_fn.d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); + + // Now build the chars column + rmm::device_uvector chars(bytes, stream, mr); + size_and_exec_fn.d_chars = chars.data(); + + // Execute the function fn again to fill in the chars data. + if (bytes > 0) { for_each_fn(size_and_exec_fn); } + + return std::pair(std::move(offsets_column), std::move(chars)); +} + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output + * + * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter + * and 3 member variables: + * - `d_sizes`: output size in bytes of each output row for the 1st pass call + * - `d_chars`: output buffer for new string data for the 2nd pass call + * - `d_offsets`: used for addressing the specific output row data in `d_chars` + * + * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`. + * Null rows should be set with an output size of 0. + * + * @code{.cpp} + * struct size_and_exec_fn { + * size_type* d_sizes; + * char* d_chars; + * input_offsetalator d_offsets; + * + * __device__ void operator()(size_type idx) + * { + * if( !d_chars ) { + * d_sizes[idx] = output_size; + * } else { + * auto d_output = d_chars + d_offsets[idx]; + * // write characters to d_output + * } + * } + * }; + * @endcode + * + * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting + * an index parameter and three member variables: `size_type* d_sizes` + * `char* d_chars`, and `input_offsetalator d_offsets`. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string + * and once again to fill in the memory pointed to by `d_chars`. + * @param strings_count Number of strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned columns' device memory + * @return Offsets child column and chars vector for creating a strings column + */ +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); +} + +} // namespace experimental +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 33d2de3cd07..97008fa94f8 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -17,11 +17,12 @@ #include #include #include +#include #include #include #include #include -#include +#include #include #include #include @@ -49,8 +50,9 @@ struct concat_strings_base { table_device_view const d_table; string_scalar_device_view const d_narep; separator_on_nulls separate_nulls; - size_type* d_offsets{}; - char* d_chars{}; + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Concatenate each table row to a single output string. @@ -68,7 +70,7 @@ struct concat_strings_base { thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { return col.is_null(idx); })) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } @@ -95,7 +97,7 @@ struct concat_strings_base { write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - if (!d_chars) d_offsets[idx] = bytes; + if (!d_chars) { d_sizes[idx] = bytes; } } }; @@ -113,7 +115,7 @@ struct concat_strings_fn : concat_strings_base { { } - __device__ void operator()(size_type idx) { process_row(idx, d_separator); } + __device__ void operator()(std::size_t idx) { process_row(idx, d_separator); } }; } // namespace @@ -143,7 +145,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -188,7 +190,7 @@ struct multi_separator_concat_fn : concat_strings_base { __device__ void operator()(size_type idx) { if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } @@ -235,7 +237,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars] = + experimental::make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 97168a7fbd7..de1d5e38e00 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -20,7 +20,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -107,22 +108,26 @@ struct compute_size_and_repeat_fn { column_device_view const strings_dv; size_type const repeat_times; bool const has_nulls; - - size_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes of the output strings. - // If d_chars != nullptr: only repeat strings. - char* d_chars{nullptr}; - - // `idx` will be in the range of [0, repeat_times * strings_count). - __device__ void operator()(size_type const idx) const noexcept + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; + + /** + * @brief Called by make_strings_children to build output + * + * @param idx Thread index in the range [0,repeat_times * strings_count) + * @param d_sizes Return output size here in 1st call (d_chars==nullptr) + * @param d_chars Write output here in 2nd call + * @param d_offsets Offsets to address output row within d_chars + */ + __device__ void operator()(size_type idx) const noexcept { auto const str_idx = idx / repeat_times; // value cycles in [0, string_count) auto const repeat_idx = idx % repeat_times; // value cycles in [0, repeat_times) auto const is_valid = !has_nulls || strings_dv.is_valid_nocheck(str_idx); if (!d_chars && repeat_idx == 0) { - d_offsets[str_idx] = + d_sizes[str_idx] = is_valid ? repeat_times * strings_dv.element(str_idx).size_bytes() : 0; } @@ -161,8 +166,8 @@ std::unique_ptr repeat_strings(strings_column_view const& input, auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()}; - auto [offsets_column, chars] = - make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children( + fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), chars.release(), @@ -182,14 +187,19 @@ struct compute_sizes_and_repeat_fn { Iterator const repeat_times_iter; bool const strings_has_nulls; bool const rtimes_has_nulls; - - size_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes of the output strings. - // If d_chars != nullptr: only repeat strings. - char* d_chars{nullptr}; - - __device__ void operator()(size_type const idx) const noexcept + size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; + + /** + * @brief Called by make_strings_children to build output + * + * @param idx Row index + * @param d_sizes Return output size here in 1st call (d_chars==nullptr) + * @param d_chars Write output here in 2nd call + * @param d_offsets Offsets to address output row within d_chars + */ + __device__ void operator()(size_type idx) const noexcept { auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx); auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx); @@ -197,7 +207,7 @@ struct compute_sizes_and_repeat_fn { // Any null input (either string or repeat_times value) will result in a null output. auto const is_valid = string_is_valid && rtimes_is_valid; if (!is_valid) { - if (!d_chars) { d_offsets[idx] = 0; } + if (!d_chars) { d_sizes[idx] = 0; } return; } @@ -206,7 +216,7 @@ struct compute_sizes_and_repeat_fn { if (!d_chars) { // repeat_times could be negative - d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0; + d_sizes[idx] = std::max(repeat_times, 0) * d_str.size_bytes(); } else { auto output_ptr = d_chars + d_offsets[idx]; while (repeat_times-- > 0) { @@ -241,7 +251,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, input.has_nulls(), repeat_times.has_nulls()}; - auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr); // We generate new bitmask by AND of the two input columns' bitmasks. // Note that if either of the input columns are nullable, the output column will also be nullable