Skip to content

Commit

Permalink
Add experimental make_strings_children utility (#15363)
Browse files Browse the repository at this point in the history
Adds new `cudf::strings::detail::experimental::make_strings_children` which uses the offsetalator to build output columns. The current `d_offsets` member required by the given functors no longer stores sizes and offsets but is now split into `d_sizes` and `d_offsets` where `d_sizes` is computed in the first pass and then `d_offsets` is set to an offsetalator for building output in `d_chars`.

Once all the uses of `make_strings_children` (~50 or so) are converted to use the experimental implementation, this will replace the old implementation and the 'experimental' namespace will be removed.

This PR includes 2 changes, `repeat_strings` and `concatenate` (per row) since each use different overloaded `make_strings_children` functions to verify the code does not break any current tests.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)

URL: #15363
  • Loading branch information
davidwendt authored Apr 23, 2024
1 parent 7341866 commit 702706d
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 33 deletions.
186 changes: 186 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children_ex.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/strings_children.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

namespace cudf {
namespace strings {
namespace detail {
namespace experimental {

/**
* @brief Kernel used by make_strings_children for calling the given functor
*
* @tparam SizeAndExecuteFunction Functor type to call in each thread
*
* @param fn Functor to call in each thread
* @param exec_size Total number of threads to be processed by this kernel
*/
template <typename SizeAndExecuteFunction>
CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size)
{
auto tid = cudf::detail::grid_1d::global_thread_id();
if (tid < exec_size) { fn(tid); }
}

/**
* @brief Creates child offsets and chars data by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
* and 3 member variables:
* - `d_sizes`: output size in bytes of each output row for the 1st pass call
* - `d_chars`: output buffer for new string data for the 2nd pass call
* - `d_offsets`: used for addressing the specific output row data in `d_chars`
*
* The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
* Null rows should be set with an output size of 0.
*
* @code{.cpp}
* struct size_and_exec_fn {
* size_type* d_sizes;
* char* d_chars;
* input_offsetalator d_offsets;
*
* __device__ void operator()(size_type thread_idx)
* {
* // functor-specific logic to resolve out_idx from thread_idx
* if( !d_chars ) {
* d_sizes[out_idx] = output_size;
* } else {
* auto d_output = d_chars + d_offsets[out_idx];
* // write characters to d_output
* }
* }
* };
* @endcode
*
* @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
* an index parameter and three member variables: `size_type* d_sizes`
* `char* d_chars`, and `input_offsetalator d_offsets`.
*
* @param size_and_exec_fn This is called twice. Once for the output size of each string
* and once again to fill in the memory pointed to by d_chars.
* @param exec_size Number of threads for executing the `size_and_exec_fn` function
* @param strings_count Number of strings
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned columns' device memory
* @return Offsets child column and chars vector for creating a strings column
*/
template <typename SizeAndExecuteFunction>
auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
size_type exec_size,
size_type strings_count,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// This is called twice -- once for computing sizes and once for writing chars.
// Reducing the number of places size_and_exec_fn is inlined speeds up compile time.
auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) {
auto constexpr block_size = 256;
auto grid = cudf::detail::grid_1d{exec_size, block_size};
strings_children_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(size_and_exec_fn,
exec_size);
};

// Compute the output sizes
auto output_sizes = rmm::device_uvector<size_type>(strings_count, stream);
size_and_exec_fn.d_sizes = output_sizes.data();
size_and_exec_fn.d_chars = nullptr;
for_each_fn(size_and_exec_fn);

// Convert the sizes to offsets
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
output_sizes.begin(), output_sizes.end(), stream, mr);
size_and_exec_fn.d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// Now build the chars column
rmm::device_uvector<char> chars(bytes, stream, mr);
size_and_exec_fn.d_chars = chars.data();

// Execute the function fn again to fill in the chars data.
if (bytes > 0) { for_each_fn(size_and_exec_fn); }

return std::pair(std::move(offsets_column), std::move(chars));
}

/**
* @brief Creates child offsets and chars columns by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
* and 3 member variables:
* - `d_sizes`: output size in bytes of each output row for the 1st pass call
* - `d_chars`: output buffer for new string data for the 2nd pass call
* - `d_offsets`: used for addressing the specific output row data in `d_chars`
*
* The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
* Null rows should be set with an output size of 0.
*
* @code{.cpp}
* struct size_and_exec_fn {
* size_type* d_sizes;
* char* d_chars;
* input_offsetalator d_offsets;
*
* __device__ void operator()(size_type idx)
* {
* if( !d_chars ) {
* d_sizes[idx] = output_size;
* } else {
* auto d_output = d_chars + d_offsets[idx];
* // write characters to d_output
* }
* }
* };
* @endcode
*
* @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
* an index parameter and three member variables: `size_type* d_sizes`
* `char* d_chars`, and `input_offsetalator d_offsets`.
*
* @param size_and_exec_fn This is called twice. Once for the output size of each string
* and once again to fill in the memory pointed to by `d_chars`.
* @param strings_count Number of strings
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned columns' device memory
* @return Offsets child column and chars vector for creating a strings column
*/
template <typename SizeAndExecuteFunction>
auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
size_type strings_count,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
}

} // namespace experimental
} // namespace detail
} // namespace strings
} // namespace cudf
21 changes: 12 additions & 9 deletions cpp/src/strings/combine/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -49,8 +50,9 @@ struct concat_strings_base {
table_device_view const d_table;
string_scalar_device_view const d_narep;
separator_on_nulls separate_nulls;
size_type* d_offsets{};
char* d_chars{};
size_type* d_sizes;
char* d_chars;
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Concatenate each table row to a single output string.
Expand All @@ -68,7 +70,7 @@ struct concat_strings_base {
thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) {
return col.is_null(idx);
})) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand All @@ -95,7 +97,7 @@ struct concat_strings_base {
write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element;
}

if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand All @@ -113,7 +115,7 @@ struct concat_strings_fn : concat_strings_base {
{
}

__device__ void operator()(size_type idx) { process_row(idx, d_separator); }
__device__ void operator()(std::size_t idx) { process_row(idx, d_separator); }
};

} // namespace
Expand Down Expand Up @@ -143,7 +145,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
// Create device views from the strings columns.
auto d_table = table_device_view::create(strings_columns, stream);
concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls};
auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);

// create resulting null mask
auto [null_mask, null_count] = cudf::detail::valid_if(
Expand Down Expand Up @@ -188,7 +190,7 @@ struct multi_separator_concat_fn : concat_strings_base {
__device__ void operator()(size_type idx)
{
if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand Down Expand Up @@ -235,7 +237,8 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,

multi_separator_concat_fn mscf{
*d_table, separator_col_view, separator_rep, col_rep, separate_nulls};
auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr);
auto [offsets_column, chars] =
experimental::make_strings_children(mscf, strings_count, stream, mr);

// Create resulting null mask
auto [null_mask, null_count] = cudf::detail::valid_if(
Expand Down
58 changes: 34 additions & 24 deletions cpp/src/strings/repeat_strings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -107,22 +108,26 @@ struct compute_size_and_repeat_fn {
column_device_view const strings_dv;
size_type const repeat_times;
bool const has_nulls;

size_type* d_offsets{nullptr};

// If d_chars == nullptr: only compute sizes of the output strings.
// If d_chars != nullptr: only repeat strings.
char* d_chars{nullptr};

// `idx` will be in the range of [0, repeat_times * strings_count).
__device__ void operator()(size_type const idx) const noexcept
size_type* d_sizes;
char* d_chars;
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Called by make_strings_children to build output
*
* @param idx Thread index in the range [0,repeat_times * strings_count)
* @param d_sizes Return output size here in 1st call (d_chars==nullptr)
* @param d_chars Write output here in 2nd call
* @param d_offsets Offsets to address output row within d_chars
*/
__device__ void operator()(size_type idx) const noexcept
{
auto const str_idx = idx / repeat_times; // value cycles in [0, string_count)
auto const repeat_idx = idx % repeat_times; // value cycles in [0, repeat_times)
auto const is_valid = !has_nulls || strings_dv.is_valid_nocheck(str_idx);

if (!d_chars && repeat_idx == 0) {
d_offsets[str_idx] =
d_sizes[str_idx] =
is_valid ? repeat_times * strings_dv.element<string_view>(str_idx).size_bytes() : 0;
}

Expand Down Expand Up @@ -161,8 +166,8 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
auto const strings_dv_ptr = column_device_view::create(input.parent(), stream);
auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()};

auto [offsets_column, chars] =
make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr);
auto [offsets_column, chars] = experimental::make_strings_children(
fn, strings_count * repeat_times, strings_count, stream, mr);
return make_strings_column(strings_count,
std::move(offsets_column),
chars.release(),
Expand All @@ -182,22 +187,27 @@ struct compute_sizes_and_repeat_fn {
Iterator const repeat_times_iter;
bool const strings_has_nulls;
bool const rtimes_has_nulls;

size_type* d_offsets{nullptr};

// If d_chars == nullptr: only compute sizes of the output strings.
// If d_chars != nullptr: only repeat strings.
char* d_chars{nullptr};

__device__ void operator()(size_type const idx) const noexcept
size_type* d_sizes;
char* d_chars;
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Called by make_strings_children to build output
*
* @param idx Row index
* @param d_sizes Return output size here in 1st call (d_chars==nullptr)
* @param d_chars Write output here in 2nd call
* @param d_offsets Offsets to address output row within d_chars
*/
__device__ void operator()(size_type idx) const noexcept
{
auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx);
auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx);

// Any null input (either string or repeat_times value) will result in a null output.
auto const is_valid = string_is_valid && rtimes_is_valid;
if (!is_valid) {
if (!d_chars) { d_offsets[idx] = 0; }
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand All @@ -206,7 +216,7 @@ struct compute_sizes_and_repeat_fn {

if (!d_chars) {
// repeat_times could be negative
d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0;
d_sizes[idx] = std::max(repeat_times, 0) * d_str.size_bytes();
} else {
auto output_ptr = d_chars + d_offsets[idx];
while (repeat_times-- > 0) {
Expand Down Expand Up @@ -241,7 +251,7 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
input.has_nulls(),
repeat_times.has_nulls()};

auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);

// We generate new bitmask by AND of the two input columns' bitmasks.
// Note that if either of the input columns are nullable, the output column will also be nullable
Expand Down

0 comments on commit 702706d

Please sign in to comment.