From 5f3a561e2d416973837134847329729122f45762 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 3 May 2023 16:57:32 -0400 Subject: [PATCH 1/5] Performance improvement in cudf::strings::join_strings for long strings --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/string/join_strings.cpp | 59 ++++++ cpp/src/strings/combine/join.cu | 200 +++++++++++------- .../strings/combine/join_strings_tests.cpp | 13 ++ 4 files changed, 204 insertions(+), 72 deletions(-) create mode 100644 cpp/benchmarks/string/join_strings.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 565a396d913..9cbf0f60f62 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -301,7 +301,9 @@ ConfigureBench( string/url_decode.cu ) -ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp) +ConfigureNVBench( + STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/join_strings.cpp +) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp new file mode 100644 index 00000000000..950a742e1d9 --- /dev/null +++ b/cpp/benchmarks/string/join_strings.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +static void bench_join(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + std::string separator(":"); + std::string narep("null"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::join_strings(input, separator, narep); + }); +} + +NVBENCH_BENCH(bench_join) + .set_name("strings_join") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 9519cf66664..0b9d14d7396 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -21,8 +21,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -33,94 +33,152 @@ #include #include -#include #include -#include namespace cudf { namespace strings { namespace detail { -std::unique_ptr join_strings(strings_column_view const& strings, +namespace { + +/** + * @brief Threshold to decide on using string-per-thread vs the string-gather + * approaches. + * + * If the average byte length of a string in a column exceeds this value then + * the string-gather function is used. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the strings_join benchmark results. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128; + +struct join_base_fn { + column_device_view const d_strings; + string_view d_separator; + string_scalar_device_view d_narep; + + __device__ std::pair process_string(size_type idx) const + { + string_view d_str{}; + string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str; + if (d_strings.is_null(idx)) { + if (d_narep.is_valid()) { + d_str = d_narep.value(); + } else { + // if null and no narep, don't output a separator either + d_sep = d_str; + } + } else { + d_str = d_strings.element(idx); + } + return {d_str, d_sep}; + } +}; + +/** + * @brief Compute output sizes and write output bytes + * + * This functor is suitable for make_strings_children + */ +struct join_fn : public join_base_fn { + size_type* d_offsets{}; + char* d_chars{}; + + join_fn(column_device_view const d_strings, + string_view d_separator, + string_scalar_device_view d_narep) + : join_base_fn{d_strings, d_separator, d_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto [d_str, d_sep] = process_string(idx); + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; + if (d_buffer) { + d_buffer = detail::copy_string(d_buffer, d_str); + d_buffer = detail::copy_string(d_buffer, d_sep); + } else { + bytes += d_str.size_bytes() + d_sep.size_bytes(); + } + if (!d_chars) { d_offsets[idx] = bytes; } + } +}; + +/** + * @brief Recode string pointers for use with make_strings_column that + * employs an efficient gather algorithm + */ +struct join_gather_fn : public join_base_fn { + string_index_pair* d_results; + + join_gather_fn(column_device_view const d_strings, + string_view d_separator, + string_scalar_device_view d_narep, + string_index_pair* d_results) + : join_base_fn{d_strings, d_separator, d_narep}, d_results{d_results} + { + } + + __device__ void operator()(size_type idx) const + { + auto [d_str, d_sep] = process_string(idx); + + auto d_output = d_results + (idx * 2); + d_output[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + d_output[1] = string_index_pair{d_sep.data(), d_sep.size_bytes()}; + } +}; + +} // namespace + +std::unique_ptr join_strings(strings_column_view const& input, string_scalar const& separator, string_scalar const& narep, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); auto d_narep = get_scalar_device_view(const_cast(narep)); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - - // create an offsets array for building the output memory layout - rmm::device_uvector output_offsets(strings_count + 1, stream); - auto d_output_offsets = output_offsets.data(); - // using inclusive-scan to compute last entry which is the total size - thrust::transform_inclusive_scan( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_output_offsets + 1, - [d_strings, d_separator, d_narep] __device__(size_type idx) { - size_type bytes = 0; - if (d_strings.is_null(idx)) { - if (!d_narep.is_valid()) return 0; // skip nulls - bytes += d_narep.size(); - } else - bytes += d_strings.element(idx).size_bytes(); - if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes(); - return bytes; - }, - thrust::plus()); - - output_offsets.set_element_to_zero_async(0, stream); - // total size is the last entry - size_type const bytes = output_offsets.back_element(stream); - - // build offsets column (only 1 string so 2 offset entries) - auto offsets_column = - make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - // set the first entry to 0 and the last entry to bytes - int32_t new_offsets[] = {0, static_cast(bytes)}; - CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data(), - new_offsets, - sizeof(new_offsets), - cudaMemcpyDefault, - stream.value())); - - // build null mask - // only one entry so it is either all valid or all null + auto d_strings = column_device_view::create(input.parent(), stream); + + auto chars_column = [&] { + if ((input.size() == input.null_count()) || + ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { + return std::get<1>( + make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); + } + + rmm::device_uvector indices(2L * input.size(), stream); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + join_gather_fn{*d_strings, d_separator, d_narep, indices.data()}); + // build the strings column and commandeer the chars column + auto joined_col = make_strings_column(indices.begin(), indices.end(), stream, mr); + return std::move(joined_col->release().children.back()); + }(); + + // build the offsets: single string has offsets [0,chars-size] + rmm::device_uvector offsets(2, stream); + offsets.set_element_to_zero_async(0, stream); + offsets.set_element(1, chars_column->size(), stream); + auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); + + // build the null mask: only one output row so it is either all-valid or all-null auto const null_count = - static_cast(strings.null_count() == strings_count && !narep.is_valid(stream)); - auto null_mask = null_count - ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) - : rmm::device_buffer{0, stream, mr}; - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) { - size_type offset = d_output_offsets[idx]; - char* d_buffer = d_chars + offset; - if (d_strings.is_null(idx)) { - if (!d_narep.is_valid()) - return; // do not write to buffer if element is null (including separator) - d_buffer = detail::copy_string(d_buffer, d_narep.value()); - } else { - string_view d_str = d_strings.element(idx); - d_buffer = detail::copy_string(d_buffer, d_str); - } - if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator); - }); + static_cast(input.null_count() == input.size() && !narep.is_valid(stream)); + auto null_mask = null_count + ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) + : rmm::device_buffer{0, stream, mr}; return make_strings_column( 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp index a265803929b..4abe45b663c 100644 --- a/cpp/tests/strings/combine/join_strings_tests.cpp +++ b/cpp/tests/strings/combine/join_strings_tests.cpp @@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join) } } +TEST_F(JoinStringsTest, JoinLongStrings) +{ + std::string data(200, '0'); + cudf::test::strings_column_wrapper input({data, data, data, data}); + + auto results = + cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+")); + + auto expected_data = data + "+" + data + "+" + data + "+" + data; + cudf::test::strings_column_wrapper expected({expected_data}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn) { cudf::column_view zero_size_strings_column( From 4fa67bbc1b73f20647444addf6cac7c6f1073654 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 8 May 2023 10:27:33 -0400 Subject: [PATCH 2/5] use iterator instead of temporary buffer for pairs --- cpp/src/strings/combine/join.cu | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 0b9d14d7396..7209a89a680 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -112,6 +112,7 @@ struct join_fn : public join_base_fn { * @brief Recode string pointers for use with make_strings_column that * employs an efficient gather algorithm */ +#if 0 struct join_gather_fn : public join_base_fn { string_index_pair* d_results; @@ -132,7 +133,23 @@ struct join_gather_fn : public join_base_fn { d_output[1] = string_index_pair{d_sep.data(), d_sep.size_bytes()}; } }; +#else +struct join_gather_fn : public join_base_fn { + join_gather_fn(column_device_view const d_strings, + string_view d_separator, + string_scalar_device_view d_narep) + : join_base_fn{d_strings, d_separator, d_narep} + { + } + __device__ string_index_pair operator()(size_type idx) const + { + auto [d_str, d_sep] = process_string(idx / 2); + return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()} + : string_index_pair{d_str.data(), d_str.size_bytes()}; + } +}; +#endif } // namespace std::unique_ptr join_strings(strings_column_view const& input, @@ -151,19 +168,21 @@ std::unique_ptr join_strings(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); auto chars_column = [&] { - if ((input.size() == input.null_count()) || + if ((input.size() == input.null_count()) || //(input.size() > 0) || ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { return std::get<1>( make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); } - rmm::device_uvector indices(2L * input.size(), stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - join_gather_fn{*d_strings, d_separator, d_narep, indices.data()}); + // rmm::device_uvector indices(2L * input.size(), stream); + // thrust::for_each_n(rmm::exec_policy(stream), + // thrust::make_counting_iterator(0), + // input.size(), + // join_gather_fn{*d_strings, d_separator, d_narep, indices.data()}); + auto indices = cudf::detail::make_counting_transform_iterator( + 0, join_gather_fn{*d_strings, d_separator, d_narep}); // build the strings column and commandeer the chars column - auto joined_col = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); return std::move(joined_col->release().children.back()); }(); From defae23bfae00e960cddea7a700813e78c3e2dc4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 8 May 2023 11:43:43 -0400 Subject: [PATCH 3/5] set threshold to 32 --- cpp/src/strings/combine/join.cu | 46 ++++++--------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 7209a89a680..91979e7aa90 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -51,7 +51,7 @@ namespace { * * This value was found using the strings_join benchmark results. */ -constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32; struct join_base_fn { column_device_view const d_strings; @@ -108,32 +108,6 @@ struct join_fn : public join_base_fn { } }; -/** - * @brief Recode string pointers for use with make_strings_column that - * employs an efficient gather algorithm - */ -#if 0 -struct join_gather_fn : public join_base_fn { - string_index_pair* d_results; - - join_gather_fn(column_device_view const d_strings, - string_view d_separator, - string_scalar_device_view d_narep, - string_index_pair* d_results) - : join_base_fn{d_strings, d_separator, d_narep}, d_results{d_results} - { - } - - __device__ void operator()(size_type idx) const - { - auto [d_str, d_sep] = process_string(idx); - - auto d_output = d_results + (idx * 2); - d_output[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; - d_output[1] = string_index_pair{d_sep.data(), d_sep.size_bytes()}; - } -}; -#else struct join_gather_fn : public join_base_fn { join_gather_fn(column_device_view const d_strings, string_view d_separator, @@ -145,11 +119,11 @@ struct join_gather_fn : public join_base_fn { __device__ string_index_pair operator()(size_type idx) const { auto [d_str, d_sep] = process_string(idx / 2); + // every other string is the separator return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()} : string_index_pair{d_str.data(), d_str.size_bytes()}; } }; -#endif } // namespace std::unique_ptr join_strings(strings_column_view const& input, @@ -168,25 +142,20 @@ std::unique_ptr join_strings(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); auto chars_column = [&] { - if ((input.size() == input.null_count()) || //(input.size() > 0) || - ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { + // build the strings column and commandeer the chars column + if ((input.size() == input.null_count()) || + ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) { return std::get<1>( make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); } - - // rmm::device_uvector indices(2L * input.size(), stream); - // thrust::for_each_n(rmm::exec_policy(stream), - // thrust::make_counting_iterator(0), - // input.size(), - // join_gather_fn{*d_strings, d_separator, d_narep, indices.data()}); + // dynamically feeds index pairs to build the output auto indices = cudf::detail::make_counting_transform_iterator( 0, join_gather_fn{*d_strings, d_separator, d_narep}); - // build the strings column and commandeer the chars column auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); return std::move(joined_col->release().children.back()); }(); - // build the offsets: single string has offsets [0,chars-size] + // build the offsets: single string output has offsets [0,chars-size] rmm::device_uvector offsets(2, stream); offsets.set_element_to_zero_async(0, stream); offsets.set_element(1, chars_column->size(), stream); @@ -199,6 +168,7 @@ std::unique_ptr join_strings(strings_column_view const& input, ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) : rmm::device_buffer{0, stream, mr}; + // perhaps this return a string_scalar instead of a single-row column return make_strings_column( 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); } From ee45b1e29368d601ee509c18befd971e54f4d3cb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 11 May 2023 09:27:55 -0400 Subject: [PATCH 4/5] change std::pair to thrust::pair --- cpp/benchmarks/string/join_strings.cpp | 3 +-- cpp/src/strings/combine/join.cu | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 950a742e1d9..a122c0022a9 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -41,7 +40,7 @@ static void bench_join(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto const chars_size = input.chars_size(); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 91979e7aa90..8641d9414a0 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -58,7 +58,7 @@ struct join_base_fn { string_view d_separator; string_scalar_device_view d_narep; - __device__ std::pair process_string(size_type idx) const + __device__ thrust::pair process_string(size_type idx) const { string_view d_str{}; string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str; From dbea49b9c57cc599e4aaa22eb477eb20760ecd94 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 15 May 2023 10:12:50 -0400 Subject: [PATCH 5/5] use mr for offsets vector --- cpp/src/strings/combine/join.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index e52987fa098..faf1be6a26f 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -157,10 +157,8 @@ std::unique_ptr join_strings(strings_column_view const& input, }(); // build the offsets: single string output has offsets [0,chars-size] - auto offsets = - cudf::detail::make_device_uvector_async(std::vector({0, chars_column->size()}), - stream, - rmm::mr::get_current_device_resource()); + auto offsets = cudf::detail::make_device_uvector_async( + std::vector({0, chars_column->size()}), stream, mr); auto offsets_column = std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); // build the null mask: only one output row so it is either all-valid or all-null