Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvement in cudf::strings::join_strings for long strings #13283

Merged
merged 15 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ ConfigureBench(
)

ConfigureNVBench(
STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp
STRINGS_NVBENCH string/case.cpp string/join_strings.cpp string/lengths.cpp string/like.cpp
string/reverse.cpp
)

# ##################################################################################################
Expand Down
59 changes: 59 additions & 0 deletions cpp/benchmarks/string/join_strings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

#include <cudf/strings/combine.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_join(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size();
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
state.add_element_count(chars_size, "chars_size"); // number of bytes;
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written

std::string separator(":");
std::string narep("null");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::strings::join_strings(input, separator, narep);
});
}

NVBENCH_BENCH(bench_join)
.set_name("strings_join")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
189 changes: 118 additions & 71 deletions cpp/src/strings/combine/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand All @@ -33,95 +33,142 @@
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_scan.h>

namespace cudf {
namespace strings {
namespace detail {

std::unique_ptr<column> join_strings(strings_column_view const& strings,
namespace {

/**
* @brief Threshold to decide on using string-per-thread vs the string-gather
* approaches.
*
* If the average byte length of a string in a column exceeds this value then
* the string-gather function is used.
* Otherwise, a regular string-parallel function is used.
*
* This value was found using the strings_join benchmark results.
*/
constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32;

struct join_base_fn {
column_device_view const d_strings;
string_view d_separator;
string_scalar_device_view d_narep;
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

__device__ std::pair<string_view, string_view> process_string(size_type idx) const
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
{
string_view d_str{};
string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str;
if (d_strings.is_null(idx)) {
if (d_narep.is_valid()) {
d_str = d_narep.value();
} else {
// if null and no narep, don't output a separator either
d_sep = d_str;
}
} else {
d_str = d_strings.element<string_view>(idx);
}
return {d_str, d_sep};
}
};

/**
* @brief Compute output sizes and write output bytes
*
* This functor is suitable for make_strings_children
*/
struct join_fn : public join_base_fn {
size_type* d_offsets{};
char* d_chars{};

join_fn(column_device_view const d_strings,
string_view d_separator,
string_scalar_device_view d_narep)
: join_base_fn{d_strings, d_separator, d_narep}
{
}

__device__ void operator()(size_type idx) const
{
auto [d_str, d_sep] = process_string(idx);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
size_type bytes = 0;
if (d_buffer) {
d_buffer = detail::copy_string(d_buffer, d_str);
d_buffer = detail::copy_string(d_buffer, d_sep);
} else {
bytes += d_str.size_bytes() + d_sep.size_bytes();
}
if (!d_chars) { d_offsets[idx] = bytes; }
}
};

struct join_gather_fn : public join_base_fn {
join_gather_fn(column_device_view const d_strings,
string_view d_separator,
string_scalar_device_view d_narep)
: join_base_fn{d_strings, d_separator, d_narep}
{
}

__device__ string_index_pair operator()(size_type idx) const
{
auto [d_str, d_sep] = process_string(idx / 2);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
// every other string is the separator
return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()}
: string_index_pair{d_str.data(), d_str.size_bytes()};
}
};
} // namespace

std::unique_ptr<column> join_strings(strings_column_view const& input,
string_scalar const& separator,
string_scalar const& narep,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
if (strings_count == 0) return make_empty_column(type_id::STRING);
if (input.is_empty()) { return make_empty_column(type_id::STRING); }

CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar");

string_view d_separator(separator.data(), separator.size());
auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;

// create an offsets array for building the output memory layout
rmm::device_uvector<size_type> output_offsets(strings_count + 1, stream);
auto d_output_offsets = output_offsets.data();
// using inclusive-scan to compute last entry which is the total size
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_output_offsets + 1,
[d_strings, d_separator, d_narep] __device__(size_type idx) {
size_type bytes = 0;
if (d_strings.is_null(idx)) {
if (!d_narep.is_valid()) return 0; // skip nulls
bytes += d_narep.size();
} else
bytes += d_strings.element<string_view>(idx).size_bytes();
if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes();
return bytes;
},
thrust::plus<size_type>());

output_offsets.set_element_to_zero_async(0, stream);
// total size is the last entry
size_type const bytes = output_offsets.back_element(stream);

// build offsets column (only 1 string so 2 offset entries)
auto offsets_column =
make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr);
auto offsets_view = offsets_column->mutable_view();
// set the first entry to 0 and the last entry to bytes
int32_t new_offsets[] = {0, static_cast<int32_t>(bytes)};
CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
new_offsets,
sizeof(new_offsets),
cudaMemcpyDefault,
stream.value()));

// build null mask
// only one entry so it is either all valid or all null
auto d_strings = column_device_view::create(input.parent(), stream);

auto chars_column = [&] {
// build the strings column and commandeer the chars column
if ((input.size() == input.null_count()) ||
((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) {
return std::get<1>(
make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
}
// dynamically feeds index pairs to build the output
auto indices = cudf::detail::make_counting_transform_iterator(
0, join_gather_fn{*d_strings, d_separator, d_narep});
auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
return std::move(joined_col->release().children.back());
}();

// build the offsets: single string output has offsets [0,chars-size]
rmm::device_uvector<size_type> offsets(2, stream);
offsets.set_element_to_zero_async(0, stream);
offsets.set_element(1, chars_column->size(), stream);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);

// build the null mask: only one output row so it is either all-valid or all-null
auto const null_count =
static_cast<size_type>(strings.null_count() == strings_count && !narep.is_valid(stream));
auto null_mask = null_count
? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
: rmm::device_buffer{0, stream, mr};
auto chars_column = create_chars_child_column(bytes, stream, mr);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) {
size_type offset = d_output_offsets[idx];
char* d_buffer = d_chars + offset;
if (d_strings.is_null(idx)) {
if (!d_narep.is_valid())
return; // do not write to buffer if element is null (including separator)
d_buffer = detail::copy_string(d_buffer, d_narep.value());
} else {
string_view d_str = d_strings.element<string_view>(idx);
d_buffer = detail::copy_string(d_buffer, d_str);
}
if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator);
});
static_cast<size_type>(input.null_count() == input.size() && !narep.is_valid(stream));
auto null_mask = null_count
? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
: rmm::device_buffer{0, stream, mr};

// perhaps this return a string_scalar instead of a single-row column
return make_strings_column(
1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask));
}
Expand Down
13 changes: 13 additions & 0 deletions cpp/tests/strings/combine/join_strings_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join)
}
}

TEST_F(JoinStringsTest, JoinLongStrings)
{
std::string data(200, '0');
cudf::test::strings_column_wrapper input({data, data, data, data});

auto results =
cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+"));

auto expected_data = data + "+" + data + "+" + data + "+" + data;
cudf::test::strings_column_wrapper expected({expected_data});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn)
{
cudf::column_view zero_size_strings_column(
Expand Down