Skip to content

Commit

Permalink
Improve performance of copy_if_else for long strings (#15017)
Browse files Browse the repository at this point in the history
Reworks `cudf::strings::detail::copy_if_else()` to improve performance for long strings. The rework builds a vector of (pointer, size) pairs, one per output row, and passes it to the `make_strings_column` factory, which uses the optimized `gather_chars` function.
Also includes a benchmark for copy_if_else specifically for strings columns.

Closes #15014

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #15017
  • Loading branch information
davidwendt authored Feb 14, 2024
1 parent 825d30c commit f43f7c5
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 42 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ ConfigureNVBench(
string/case.cpp
string/char_types.cpp
string/contains.cpp
string/copy_if_else.cpp
string/count.cpp
string/extract.cpp
string/gather.cpp
Expand Down
62 changes: 62 additions & 0 deletions cpp/benchmarks/string/copy_if_else.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/copying.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks `cudf::copy_if_else` on two randomly generated strings columns.
 *
 * Reads the `num_rows` and `row_width` axes from @p state, builds two random
 * strings columns (lengths drawn from a normal distribution over [0, row_width])
 * plus a random non-nullable BOOL8 selector column, and times
 * `cudf::copy_if_else(source, target, left_right)`.
 *
 * Configurations where `num_rows * row_width` reaches the `cudf::size_type`
 * limit are skipped.
 *
 * @param state NVBench state supplying the axis values and collecting results
 */
static void bench_copy(nvbench::state& state)
{
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

  // Compare in std::size_t so the product itself cannot overflow size_type.
  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
    state.skip("Skip benchmarks greater than size_type limit");
    // Calling skip() does not leave this function; return explicitly so the
    // oversized tables below are never generated for a skipped configuration.
    return;
  }

  data_profile const str_profile = data_profile_builder().distribution(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
  auto const source_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile);
  auto const target_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile);
  data_profile const bool_profile = data_profile_builder().no_validity();
  auto const booleans =
    create_random_table({cudf::type_id::BOOL8}, row_count{num_rows}, bool_profile);

  auto const source     = source_table->view().column(0);
  auto const target     = target_table->view().column(0);
  auto const left_right = booleans->view().column(0);

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  // Throughput is estimated from the target column only: both inputs are built
  // from the same profile, so their character sizes are similar.
  auto chars_size = cudf::strings_column_view(target).chars_size(cudf::get_default_stream());
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // both columns are similar size

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    [[maybe_unused]] auto result = cudf::copy_if_else(source, target, left_right);
  });
}

// Registers bench_copy with NVBench under the name "copy_if_else".
// The benchmark is run over the combinations of the two int64 axes below:
//   row_width - upper bound passed to the string-length distribution in bench_copy
//   num_rows  - number of rows in each generated column
// Combinations whose num_rows * row_width reaches the cudf::size_type limit
// (e.g. 4096 x 16777216) are flagged via state.skip() inside bench_copy.
NVBENCH_BENCH(bench_copy)
.set_name("copy_if_else")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
63 changes: 21 additions & 42 deletions cpp/include/cudf/strings/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,16 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/optional.h>
#include <thrust/transform.h>

#include <cuda/functional>

Expand Down Expand Up @@ -65,55 +63,36 @@ std::unique_ptr<cudf::column> copy_if_else(StringIterLeft lhs_begin,
rmm::mr::device_memory_resource* mr)
{
auto strings_count = std::distance(lhs_begin, lhs_end);
if (strings_count == 0) return make_empty_column(type_id::STRING);
if (strings_count == 0) { return make_empty_column(type_id::STRING); }

// create null mask
auto valid_mask = cudf::detail::valid_if(
auto [null_mask, null_count] = cudf::detail::valid_if(
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
return filter_fn(idx) ? lhs_begin[idx].has_value() : rhs_begin[idx].has_value();
},
stream,
mr);
size_type null_count = valid_mask.second;
auto null_mask = (null_count > 0) ? std::move(valid_mask.first) : rmm::device_buffer{};
if (null_count == 0) { null_mask = rmm::device_buffer{}; }

// build offsets column
auto offsets_transformer = cuda::proclaim_return_type<size_type>(
[lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
return result.has_value() ? result->size_bytes() : 0;
});

auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().template data<int32_t>();
// build vector of strings
rmm::device_uvector<string_index_pair> indices(strings_count, stream);
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
indices.begin(),
[lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
auto const d_str = result.has_value() ? *result : string_view{"", 0};
return string_index_pair{d_str.data(), d_str.size_bytes()};
});

// build chars column
auto chars_column = create_chars_child_column(bytes, stream, mr);
auto d_chars = chars_column->mutable_view().template data<char>();
// fill in chars
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[lhs_begin, rhs_begin, filter_fn, d_offsets, d_chars] __device__(size_type idx) {
auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
if (!result.has_value()) return;
auto const d_str = *result;
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
// convert vector into strings column
auto result = make_strings_column(indices.begin(), indices.end(), stream, mr);
result->set_null_mask(std::move(null_mask), null_count);
return result;
}

} // namespace detail
} // namespace strings
} // namespace cudf

0 comments on commit f43f7c5

Please sign in to comment.