Skip to content

Commit

Permalink
Performance improvement in cudf::strings::join_strings for long strin…
Browse files Browse the repository at this point in the history
…gs (#13283)

Improves performance for longer strings with `cudf::strings::join_strings()` API.

The new implementation is inspired by recent changes in json/writer_impl.cu which uses the `make_strings_children` factor that accepts an iterator of string pointers to gather the results into a single column of data but does lose some performance for smallish-strings (less than 32 bytes). So the original code is refactored to work on smaller strings.
Also, a benchmark implementation is added in this PR to test against various sized input columns.
For longer strings, the performance improvement is up to 100x for very long strings (>512 bytes).

Reference #13048

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13283
  • Loading branch information
davidwendt authored May 15, 2023
1 parent 23a77a6 commit 4c456cb
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 73 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,8 @@ ConfigureBench(
)

ConfigureNVBench(
STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp
string/reverse.cpp
STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/join_strings.cpp string/lengths.cpp
string/like.cpp string/reverse.cpp
)

# ##################################################################################################
Expand Down
58 changes: 58 additions & 0 deletions cpp/benchmarks/string/join_strings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/strings/combine.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_join(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
// gather some throughput statistics as well
auto const chars_size = input.chars_size();
state.add_element_count(chars_size, "chars_size"); // number of bytes;
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read;
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written

std::string separator(":");
std::string narep("null");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::strings::join_strings(input, separator, narep);
});
}

NVBENCH_BENCH(bench_join)
.set_name("strings_join")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
189 changes: 118 additions & 71 deletions cpp/src/strings/combine/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand All @@ -33,95 +34,141 @@
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_scan.h>

namespace cudf {
namespace strings {
namespace detail {

std::unique_ptr<column> join_strings(strings_column_view const& strings,
namespace {

/**
* @brief Threshold to decide on using string-per-thread vs the string-gather
* approaches.
*
* If the average byte length of a string in a column exceeds this value then
* the string-gather function is used.
* Otherwise, a regular string-parallel function is used.
*
* This value was found using the strings_join benchmark results.
*/
constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32;

struct join_base_fn {
column_device_view const d_strings;
string_view d_separator;
string_scalar_device_view d_narep;

__device__ thrust::pair<string_view, string_view> process_string(size_type idx) const
{
string_view d_str{};
string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str;
if (d_strings.is_null(idx)) {
if (d_narep.is_valid()) {
d_str = d_narep.value();
} else {
// if null and no narep, don't output a separator either
d_sep = d_str;
}
} else {
d_str = d_strings.element<string_view>(idx);
}
return {d_str, d_sep};
}
};

/**
* @brief Compute output sizes and write output bytes
*
* This functor is suitable for make_strings_children
*/
struct join_fn : public join_base_fn {
size_type* d_offsets{};
char* d_chars{};

join_fn(column_device_view const d_strings,
string_view d_separator,
string_scalar_device_view d_narep)
: join_base_fn{d_strings, d_separator, d_narep}
{
}

__device__ void operator()(size_type idx) const
{
auto const [d_str, d_sep] = process_string(idx);

char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
size_type bytes = 0;
if (d_buffer) {
d_buffer = detail::copy_string(d_buffer, d_str);
d_buffer = detail::copy_string(d_buffer, d_sep);
} else {
bytes += d_str.size_bytes() + d_sep.size_bytes();
}
if (!d_chars) { d_offsets[idx] = bytes; }
}
};

struct join_gather_fn : public join_base_fn {
join_gather_fn(column_device_view const d_strings,
string_view d_separator,
string_scalar_device_view d_narep)
: join_base_fn{d_strings, d_separator, d_narep}
{
}

__device__ string_index_pair operator()(size_type idx) const
{
auto const [d_str, d_sep] = process_string(idx / 2);
// every other string is the separator
return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()}
: string_index_pair{d_str.data(), d_str.size_bytes()};
}
};
} // namespace

std::unique_ptr<column> join_strings(strings_column_view const& input,
string_scalar const& separator,
string_scalar const& narep,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = strings.size();
if (strings_count == 0) return make_empty_column(type_id::STRING);
if (input.is_empty()) { return make_empty_column(type_id::STRING); }

CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar");

string_view d_separator(separator.data(), separator.size());
auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;

// create an offsets array for building the output memory layout
rmm::device_uvector<size_type> output_offsets(strings_count + 1, stream);
auto d_output_offsets = output_offsets.data();
// using inclusive-scan to compute last entry which is the total size
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_output_offsets + 1,
[d_strings, d_separator, d_narep] __device__(size_type idx) {
size_type bytes = 0;
if (d_strings.is_null(idx)) {
if (!d_narep.is_valid()) return 0; // skip nulls
bytes += d_narep.size();
} else
bytes += d_strings.element<string_view>(idx).size_bytes();
if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes();
return bytes;
},
thrust::plus<size_type>());

output_offsets.set_element_to_zero_async(0, stream);
// total size is the last entry
size_type const bytes = output_offsets.back_element(stream);

// build offsets column (only 1 string so 2 offset entries)
auto offsets_column =
make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr);
auto offsets_view = offsets_column->mutable_view();
// set the first entry to 0 and the last entry to bytes
int32_t new_offsets[] = {0, static_cast<int32_t>(bytes)};
CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
new_offsets,
sizeof(new_offsets),
cudaMemcpyDefault,
stream.value()));

// build null mask
// only one entry so it is either all valid or all null
auto d_strings = column_device_view::create(input.parent(), stream);

auto chars_column = [&] {
// build the strings column and commandeer the chars column
if ((input.size() == input.null_count()) ||
((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) {
return std::get<1>(
make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
}
// dynamically feeds index pairs to build the output
auto indices = cudf::detail::make_counting_transform_iterator(
0, join_gather_fn{*d_strings, d_separator, d_narep});
auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
return std::move(joined_col->release().children.back());
}();

// build the offsets: single string output has offsets [0,chars-size]
auto offsets = cudf::detail::make_device_uvector_async(
std::vector<size_type>({0, chars_column->size()}), stream, mr);
auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);

// build the null mask: only one output row so it is either all-valid or all-null
auto const null_count =
static_cast<size_type>(strings.null_count() == strings_count && !narep.is_valid(stream));
auto null_mask = null_count
? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
: rmm::device_buffer{0, stream, mr};
auto chars_column = create_chars_child_column(bytes, stream, mr);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) {
size_type offset = d_output_offsets[idx];
char* d_buffer = d_chars + offset;
if (d_strings.is_null(idx)) {
if (!d_narep.is_valid())
return; // do not write to buffer if element is null (including separator)
d_buffer = detail::copy_string(d_buffer, d_narep.value());
} else {
string_view d_str = d_strings.element<string_view>(idx);
d_buffer = detail::copy_string(d_buffer, d_str);
}
if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator);
});
static_cast<size_type>(input.null_count() == input.size() && !narep.is_valid(stream));
auto null_mask = null_count
? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
: rmm::device_buffer{0, stream, mr};

// perhaps this return a string_scalar instead of a single-row column
return make_strings_column(
1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask));
}
Expand Down
13 changes: 13 additions & 0 deletions cpp/tests/strings/combine/join_strings_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join)
}
}

TEST_F(JoinStringsTest, JoinLongStrings)
{
std::string data(200, '0');
cudf::test::strings_column_wrapper input({data, data, data, data});

auto results =
cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+"));

auto expected_data = data + "+" + data + "+" + data + "+" + data;
cudf::test::strings_column_wrapper expected({expected_data});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn)
{
cudf::column_view zero_size_strings_column(
Expand Down

0 comments on commit 4c456cb

Please sign in to comment.