diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c3db3370c62..bdc72cc4535 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -300,8 +300,8 @@ ConfigureBench( ) ConfigureNVBench( - STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp - string/reverse.cpp + STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/join_strings.cpp string/lengths.cpp + string/like.cpp string/reverse.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp new file mode 100644 index 00000000000..a122c0022a9 --- /dev/null +++ b/cpp/benchmarks/string/join_strings.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_join(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto const chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + std::string separator(":"); + std::string narep("null"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::join_strings(input, separator, narep); + }); +} + +NVBENCH_BENCH(bench_join) + .set_name("strings_join") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 9519cf66664..faf1be6a26f 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -18,11 +18,12 @@ #include #include #include +#include #include #include #include +#include #include -#include #include #include #include @@ -33,95 +34,141 @@ #include #include -#include #include -#include namespace cudf { namespace strings { namespace detail { -std::unique_ptr join_strings(strings_column_view const& strings, +namespace { + +/** + * @brief Threshold to decide on using string-per-thread vs the string-gather + * approaches. + * + * If the average byte length of a string in a column exceeds this value then + * the string-gather function is used. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the strings_join benchmark results. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32; + +struct join_base_fn { + column_device_view const d_strings; + string_view d_separator; + string_scalar_device_view d_narep; + + __device__ thrust::pair process_string(size_type idx) const + { + string_view d_str{}; + string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str; + if (d_strings.is_null(idx)) { + if (d_narep.is_valid()) { + d_str = d_narep.value(); + } else { + // if null and no narep, don't output a separator either + d_sep = d_str; + } + } else { + d_str = d_strings.element(idx); + } + return {d_str, d_sep}; + } +}; + +/** + * @brief Compute output sizes and write output bytes + * + * This functor is suitable for make_strings_children + */ +struct join_fn : public join_base_fn { + size_type* d_offsets{}; + char* d_chars{}; + + join_fn(column_device_view const d_strings, + string_view d_separator, + string_scalar_device_view d_narep) + : join_base_fn{d_strings, d_separator, d_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto const [d_str, d_sep] = process_string(idx); + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; + if (d_buffer) { + d_buffer = detail::copy_string(d_buffer, d_str); + d_buffer = detail::copy_string(d_buffer, d_sep); + } else { + bytes += d_str.size_bytes() + d_sep.size_bytes(); + } + if (!d_chars) { d_offsets[idx] = bytes; } + } +}; + +struct join_gather_fn : public join_base_fn { + join_gather_fn(column_device_view const d_strings, + string_view d_separator, + string_scalar_device_view d_narep) + : join_base_fn{d_strings, d_separator, d_narep} + { + } + + __device__ string_index_pair operator()(size_type idx) const + { + auto const [d_str, d_sep] = process_string(idx / 2); + // every other string is the separator + return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()} + : string_index_pair{d_str.data(), d_str.size_bytes()}; + } +}; +} // namespace + +std::unique_ptr join_strings(strings_column_view const& input, string_scalar const& separator, string_scalar const& narep, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); auto d_narep = get_scalar_device_view(const_cast(narep)); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - - // create an offsets array for building the output memory layout - rmm::device_uvector output_offsets(strings_count + 1, stream); - auto d_output_offsets = output_offsets.data(); - // using inclusive-scan to compute last entry which is the total size - thrust::transform_inclusive_scan( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_output_offsets + 1, - [d_strings, d_separator, d_narep] __device__(size_type idx) { - size_type bytes = 0; - if (d_strings.is_null(idx)) { - if (!d_narep.is_valid()) return 0; // skip nulls - bytes += d_narep.size(); - } else - bytes += d_strings.element(idx).size_bytes(); - if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes(); - return bytes; - }, - thrust::plus()); - - output_offsets.set_element_to_zero_async(0, stream); - // total size is the last entry - size_type const bytes = output_offsets.back_element(stream); - - // build offsets column (only 1 string so 2 offset entries) - auto offsets_column = - make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - // set the first entry to 0 and the last entry to bytes - int32_t new_offsets[] = {0, static_cast(bytes)}; - CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data(), - new_offsets, - sizeof(new_offsets), - cudaMemcpyDefault, - stream.value())); - - // build null mask - // only one entry so it is either all valid or all null + auto d_strings = column_device_view::create(input.parent(), stream); + + auto chars_column = [&] { + // build the strings column and commandeer the chars column + if ((input.size() == input.null_count()) || + ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) { + return std::get<1>( + make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); + } + // dynamically feeds index pairs to build the output + auto indices = cudf::detail::make_counting_transform_iterator( + 0, join_gather_fn{*d_strings, d_separator, d_narep}); + auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); + return std::move(joined_col->release().children.back()); + }(); + + // build the offsets: single string output has offsets [0,chars-size] + auto offsets = cudf::detail::make_device_uvector_async( + std::vector({0, chars_column->size()}), stream, mr); + auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); + + // build the null mask: only one output row so it is either all-valid or all-null auto const null_count = - static_cast(strings.null_count() == strings_count && !narep.is_valid(stream)); - auto null_mask = null_count - ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) - : rmm::device_buffer{0, stream, mr}; - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) { - size_type offset = d_output_offsets[idx]; - char* d_buffer = d_chars + offset; - if (d_strings.is_null(idx)) { - if (!d_narep.is_valid()) - return; // do not write to buffer if element is null (including separator) - d_buffer = detail::copy_string(d_buffer, d_narep.value()); - } else { - string_view d_str = d_strings.element(idx); - d_buffer = detail::copy_string(d_buffer, d_str); - } - if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator); - }); + static_cast(input.null_count() == input.size() && !narep.is_valid(stream)); + auto null_mask = null_count + ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) + : rmm::device_buffer{0, stream, mr}; + // perhaps this return a string_scalar instead of a single-row column return make_strings_column( 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); } diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp index a265803929b..4abe45b663c 100644 --- a/cpp/tests/strings/combine/join_strings_tests.cpp +++ b/cpp/tests/strings/combine/join_strings_tests.cpp @@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join) } } +TEST_F(JoinStringsTest, JoinLongStrings) +{ + std::string data(200, '0'); + cudf::test::strings_column_wrapper input({data, data, data, data}); + + auto results = + cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+")); + + auto expected_data = data + "+" + data + "+" + data + "+" + data; + cudf::test::strings_column_wrapper expected({expected_data}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn) { cudf::column_view zero_size_strings_column(