From 2817a7d07eb976a721068cf65fd776f10cc92ed7 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Mar 2021 17:47:51 -0500 Subject: [PATCH 1/4] Add gbenchmark for strings::concatenate --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/combine_benchmark.cpp | 74 +++++++++++++++++++++ cpp/src/strings/combine.cu | 63 +++++++++--------- 3 files changed, 106 insertions(+), 32 deletions(-) create mode 100644 cpp/benchmarks/string/combine_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..1b1e01d5e9e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -180,6 +180,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp) # - strings benchmark ------------------------------------------------------------------- ConfigureBench(STRINGS_BENCH string/case_benchmark.cpp + string/combine_benchmark.cpp string/contains_benchmark.cpp string/convert_durations_benchmark.cpp string/convert_floats_benchmark.cpp diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine_benchmark.cpp new file mode 100644 index 00000000000..36a5c80b383 --- /dev/null +++ b/cpp/benchmarks/string/combine_benchmark.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "string_bench_args.hpp" + +class StringCombine : public cudf::benchmark { +}; + +static void BM_combine(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 2, row_count{n_rows}, table_profile); + cudf::strings_column_view input1(table->view().column(0)); + cudf::strings_column_view input2(table->view().column(1)); + cudf::string_scalar separator("+"); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::strings::concatenate(table->view(), separator); + } + + state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size())); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 4; + int const max_rowlen = 1 << 11; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringCombine, name) \ + (::benchmark::State & st) { BM_combine(st); } \ + BENCHMARK_REGISTER_F(StringCombine, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(concat) diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 4e61d4d8c41..354afa2fb40 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -31,7 +32,7 @@ #include #include -#include +#include #include #include @@ -50,7 +51,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto num_columns = strings_columns.num_columns(); + auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -59,7 +60,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, "All columns must be of type string"); if (num_columns == 1) // single strings column returns a copy return std::make_unique(*(strings_columns.begin()), stream, mr); - auto strings_count = strings_columns.num_rows(); + auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -88,12 +89,12 @@ std::unique_ptr concatenate(table_view const& strings_columns, // build offsets column by computing sizes of each string in the output auto offsets_transformer = [d_table, d_separator, d_narep] __device__(size_type row_idx) { // for this row (idx), iterate over each column and add up the bytes - bool null_element = + bool const null_element = thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [row_idx](auto const& d_column) { return d_column.is_null(row_idx); }); if (null_element && !d_narep.is_valid()) return 0; - size_type bytes = thrust::transform_reduce( + size_type const bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), @@ -105,9 +106,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, 0, thrust::plus()); // separator goes only in between elements - if (bytes > 0) // if not null - bytes -= d_separator.size_bytes(); // remove the last separator - return bytes; + return bytes == 0 ? 0 : (bytes - d_separator.size_bytes()); // remove the last separator }; auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); @@ -116,7 +115,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto d_results_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column @@ -127,14 +127,13 @@ std::unique_ptr concatenate(table_view const& strings_columns, strings_count, [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__( size_type idx) { - bool null_element = thrust::any_of( + bool const null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](column_device_view const& col) { return col.is_null(idx); }); if (null_element && !d_narep.is_valid()) return; // do not write to buffer at all if any column element for this row is null - size_type offset = d_results_offsets[idx]; - char* d_buffer = d_results_chars + offset; + char* d_buffer = d_results_chars + d_results_offsets[idx]; // write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { auto d_column = d_table.column(col_idx); @@ -173,8 +172,8 @@ std::unique_ptr join_strings(strings_column_view const& strings, auto d_strings = *strings_column; // create an offsets array for building the output memory layout - rmm::device_vector output_offsets(strings_count + 1); - auto d_output_offsets = output_offsets.data().get(); + rmm::device_uvector output_offsets(strings_count + 1, stream); + auto d_output_offsets = output_offsets.data(); // using inclusive-scan to compute last entry which is the total size thrust::transform_inclusive_scan( rmm::exec_policy(stream), @@ -192,9 +191,10 @@ std::unique_ptr join_strings(strings_column_view const& strings, return bytes; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream.value())); + size_type const zero = 0; + output_offsets.set_element_async(0, zero, stream); // total size is the last entry - size_type bytes = output_offsets.back(); + size_type const bytes = output_offsets.back_element(stream); // build offsets column (only 1 string so 2 offset entries) auto offsets_column = @@ -254,7 +254,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto num_columns = strings_columns.num_columns(); + auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); // Check if all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -262,7 +262,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - auto strings_count = strings_columns.num_rows(); + auto const strings_count = strings_columns.num_rows(); CUDF_EXPECTS(strings_count == separators.size(), "Separators column should be the same size as the strings columns"); if (strings_count == 0) // Empty begets empty @@ -277,7 +277,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (num_columns == 1) { // Shallow copy of the resultant strings - rmm::device_vector out_col_strings(strings_count); + rmm::device_uvector out_col_strings(strings_count, stream); // Device view of the only column in the table view auto const col0_ptr = column_device_view::create(strings_columns.column(0), stream); @@ -288,7 +288,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - out_col_strings.data().get(), + out_col_strings.begin(), // Output depends on the separator [col0, invalid_str, separator_col_view, separator_rep, col_rep] __device__(auto ridx) { if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return invalid_str; @@ -334,7 +334,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return 0; // For this row (idx), iterate over each column and add up the bytes - bool all_nulls = + bool const all_nulls = thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& d_column) { return d_column.is_null(ridx); }); @@ -343,11 +343,11 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (all_nulls && !col_rep.is_valid()) return 0; // There is at least one non-null column value (it can still be empty though) - auto separator_str = separator_col_view.is_valid(ridx) - ? separator_col_view.element(ridx) - : separator_rep.value(); + auto const separator_str = separator_col_view.is_valid(ridx) + ? separator_col_view.element(ridx) + : separator_rep.value(); - size_type bytes = thrust::transform_reduce( + size_type const bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), @@ -395,7 +395,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // to replace, do not write anything for this row if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return; - bool all_nulls = thrust::all_of( + bool const all_nulls = thrust::all_of( thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { return col.is_null(ridx); }); @@ -404,14 +404,13 @@ std::unique_ptr concatenate(table_view const& strings_columns, // skip this row if (all_nulls && !col_rep.is_valid()) return; - size_type offset = d_results_offsets[ridx]; - char* d_buffer = d_results_chars + offset; + char* d_buffer = d_results_chars + d_results_offsets[ridx]; bool colval_written = false; // There is at least one non-null column value (it can still be empty though) - auto separator_str = separator_col_view.is_valid(ridx) - ? separator_col_view.element(ridx) - : separator_rep.value(); + auto const separator_str = separator_col_view.is_valid(ridx) + ? separator_col_view.element(ridx) + : separator_rep.value(); // Write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { From 68ab791955326ea251d507648bb7d68c818c1c0c Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 15 Mar 2021 12:38:44 -0400 Subject: [PATCH 2/4] cleanup some var decls -- add consts --- cpp/src/strings/combine.cu | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 354afa2fb40..e91c40e519c 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -136,8 +136,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, char* d_buffer = d_results_chars + d_results_offsets[idx]; // write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { - auto d_column = d_table.column(col_idx); - string_view d_str = + auto const d_column = d_table.column(col_idx); + string_view const d_str = d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); d_buffer = detail::copy_string(d_buffer, d_str); // separator goes only in between elements @@ -414,18 +414,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { - auto d_column = d_table.column(col_idx); - // If the column isn't valid and if there isn't a replacement for it, skip - // it + auto const d_column = d_table.column(col_idx); + // If the row is null and if there is no replacement, skip it if (d_column.is_null(ridx) && !col_rep.is_valid()) continue; // Separator goes only in between elements if (colval_written) d_buffer = detail::copy_string(d_buffer, separator_str); - string_view d_str = d_column.is_null(ridx) - ? col_rep.value() - : d_column.element(ridx); + string_view const d_str = d_column.is_null(ridx) + ? col_rep.value() + : d_column.element(ridx); d_buffer = detail::copy_string(d_buffer, d_str); colval_written = true; } From 5208be19318a2dffd1b6f3a11fb90d950cb984f8 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 17 Mar 2021 17:36:53 -0400 Subject: [PATCH 3/4] added comment about stream synch with set_element_async/back_element usage --- cpp/src/strings/combine.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index e91c40e519c..f9b8b9e0ea3 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -194,6 +194,8 @@ std::unique_ptr join_strings(strings_column_view const& strings, size_type const zero = 0; output_offsets.set_element_async(0, zero, stream); // total size is the last entry + // Note this call does a synchronize on the stream and thereby also protects the + // set_element_async parameter from going out of scope before it is used. size_type const bytes = output_offsets.back_element(stream); // build offsets column (only 1 string so 2 offset entries) From 76b0e0fe57660fea9fbc435038278f3d6f8b88a1 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 18 Mar 2021 10:40:59 -0400 Subject: [PATCH 4/4] fix header includes --- cpp/benchmarks/string/combine_benchmark.cpp | 6 ++---- cpp/benchmarks/string/string_bench_args.hpp | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine_benchmark.cpp index 36a5c80b383..2a5013a9ae7 100644 --- a/cpp/benchmarks/string/combine_benchmark.cpp +++ b/cpp/benchmarks/string/combine_benchmark.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "string_bench_args.hpp" + #include #include #include @@ -24,10 +26,6 @@ #include #include -#include - -#include "string_bench_args.hpp" - class StringCombine : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index 9c709b064dd..05ed1bf5b33 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -19,6 +19,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. *