diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 0f38138fff9..001a4cce122 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -311,6 +311,7 @@ set(STRINGS_BENCH_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/string/copy_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/find_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/replace_benchmark.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/split_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/string/url_decode_benchmark.cpp")
 
 ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
diff --git a/cpp/benchmarks/string/split_benchmark.cpp b/cpp/benchmarks/string/split_benchmark.cpp
new file mode 100644
index 00000000000..35bedb1b767
--- /dev/null
+++ b/cpp/benchmarks/string/split_benchmark.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/types.hpp>
+
+#include <limits>
+
+class StringSplit : public cudf::benchmark {
+};
+
+enum split_type { split, split_ws, record, record_ws };
+
+static void BM_split(benchmark::State& state, split_type rt)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  cudf::string_scalar target("+");
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (rt) {
+      case split: cudf::strings::split(input, target); break;
+      case split_ws: cudf::strings::split(input); break;
+      case record: cudf::strings::split_record(input, target); break;
+      case record_ws: cudf::strings::split_record(input); break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
+    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
+      // avoid generating combinations that exceed the cudf column limit
+      size_t total_chars = static_cast<size_t>(row_count) * rowlen;
+      if (total_chars < std::numeric_limits<cudf::size_type>::max()) {
+        b->Args({row_count, rowlen});
+      }
+    }
+  }
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                                  \
+  BENCHMARK_DEFINE_F(StringSplit, name)                                 \
+  (::benchmark::State & st) { BM_split(st, split_type::name); }         \
+  BENCHMARK_REGISTER_F(StringSplit, name)                               \
+    ->Apply(generate_bench_args)                                        \
+    ->UseManualTime()                                                   \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(split)
+STRINGS_BENCHMARK_DEFINE(split_ws)
+STRINGS_BENCHMARK_DEFINE(record)
+STRINGS_BENCHMARK_DEFINE(record_ws)
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 00e4998b688..be6ace5e1fe 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/split/split.hpp>
@@ -34,7 +35,7 @@
 #include <thrust/binary_search.h>  // upper_bound()
 #include <thrust/copy.h>           // copy_if()
 #include <thrust/count.h>          // count_if()
-#include <thrust/extrema.h>        // max()
+#include <thrust/functional.h>     // maximum()
 #include <thrust/transform.h>      // transform()
 
 namespace cudf {
@@ -429,7 +430,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                                 rmm::mr::device_memory_resource* mr)
 {
   std::vector<std::unique_ptr<column>> results;
-  auto strings_count = strings_column.size();
+  auto const strings_count = strings_column.size();
   if (strings_count == 0) {
     results.push_back(make_empty_strings_column(stream, mr));
     return std::make_unique<table>(std::move(results));
@@ -437,11 +438,13 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
 
   auto d_offsets = strings_column.offsets().data<int32_t>();
   d_offsets += strings_column.offset();  // nvbug-2808421 : do not combine with the previous line
-  auto chars_bytes = thrust::device_pointer_cast(d_offsets)[strings_count] -
-                     thrust::device_pointer_cast(d_offsets)[0];
+  auto const chars_bytes =
+    cudf::detail::get_value<size_type>(
+      strings_column.offsets(), strings_column.offset() + strings_count, stream) -
+    cudf::detail::get_value<size_type>(strings_column.offsets(), strings_column.offset(), stream);
 
   // count the number of delimiters in the entire column
-  size_type delimiter_count =
+  auto const delimiter_count =
     thrust::count_if(rmm::exec_policy(stream),
                      thrust::make_counting_iterator(0),
                      thrust::make_counting_iterator(chars_bytes),
@@ -450,8 +453,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                      });
 
   // create vector of every delimiter position in the chars column
-  rmm::device_vector<size_type> delimiter_positions(delimiter_count);
-  auto d_positions = delimiter_positions.data().get();
+  rmm::device_uvector<size_type> delimiter_positions(delimiter_count, stream);
+  auto d_positions = delimiter_positions.data();
   auto copy_end = thrust::copy_if(rmm::exec_policy(stream),
                                   thrust::make_counting_iterator(0),
                                   thrust::make_counting_iterator(chars_bytes),
@@ -461,8 +464,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                                   });
 
   // create vector of string indices for each delimiter
-  rmm::device_vector<size_type> string_indices(delimiter_count);  // these will be strings that
-  auto d_string_indices = string_indices.data().get();            // only contain delimiters
+  rmm::device_uvector<size_type> string_indices(delimiter_count, stream);  // these will
+  auto d_string_indices = string_indices.data();  // be strings that only contain delimiters
   thrust::upper_bound(rmm::exec_policy(stream),
                       d_offsets,
                       d_offsets + strings_count,
@@ -471,8 +474,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                       string_indices.begin());
 
   // compute the number of tokens per string
-  rmm::device_vector<size_type> token_counts(strings_count);
-  auto d_token_counts = token_counts.data().get();
+  rmm::device_uvector<size_type> token_counts(strings_count, stream);
+  auto d_token_counts = token_counts.data();
   // first, initialize token counts for strings without delimiters in them
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator(0),
@@ -482,6 +485,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
                       // null are 0, all others 1
                       return static_cast<size_type>(tokenizer.is_valid(idx));
                     });
+
   // now compute the number of tokens in each string
   thrust::for_each_n(
     rmm::exec_policy(stream),
@@ -493,8 +497,11 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
     });
 
   // the columns_count is the maximum number of tokens for any string
-  size_type columns_count =
-    *thrust::max_element(rmm::exec_policy(stream), token_counts.begin(), token_counts.end());
+  auto const columns_count = thrust::reduce(rmm::exec_policy(stream),
+                                            token_counts.begin(),
+                                            token_counts.end(),
+                                            0,
+                                            thrust::maximum<size_type>{});
   // boundary case: if no columns, return one null column (custrings issue #119)
   if (columns_count == 0) {
     results.push_back(std::make_unique<column>(
@@ -506,8 +513,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
   }
 
   // create working area to hold all token positions
-  rmm::device_vector<string_index_pair> tokens(columns_count * strings_count);
-  string_index_pair* d_tokens = tokens.data().get();
+  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  string_index_pair* d_tokens = tokens.data();
   // initialize the token positions
   // -- accounts for nulls, empty, and strings with no delimiter in them
   thrust::for_each_n(rmm::exec_policy(stream),
@@ -748,20 +755,20 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
                                            rmm::mr::device_memory_resource* mr)
 {
   // compute the number of tokens per string
-  size_type columns_count = 0;
-  rmm::device_vector<size_type> token_counts(strings_count);
-  auto d_token_counts = token_counts.data().get();
-  if (strings_count > 0) {
-    thrust::transform(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(strings_count),
-      d_token_counts,
-      [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); });
-    // column count is the maximum number of tokens for any string
-    columns_count =
-      *thrust::max_element(rmm::exec_policy(stream), token_counts.begin(), token_counts.end());
-  }
+  rmm::device_uvector<size_type> token_counts(strings_count, stream);
+  auto d_token_counts = token_counts.data();
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(strings_count),
+                    d_token_counts,
+                    [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); });
+
+  // column count is the maximum number of tokens for any string
+  size_type const columns_count = thrust::reduce(rmm::exec_policy(stream),
+                                                 token_counts.begin(),
+                                                 token_counts.end(),
+                                                 0,
+                                                 thrust::maximum<size_type>{});
 
   std::vector<std::unique_ptr<column>> results;
   // boundary case: if no columns, return one null column (issue #119)
@@ -775,8 +782,8 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
   }
 
   // get the positions for every token
-  rmm::device_vector<string_index_pair> tokens(columns_count * strings_count);
-  string_index_pair* d_tokens = tokens.data().get();
+  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  string_index_pair* d_tokens = tokens.data();
   thrust::fill(rmm::exec_policy(stream),
                d_tokens,
                d_tokens + (columns_count * strings_count),
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index f171dcabc5d..b780791c7a5 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -243,9 +243,9 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
   // last entry is the total number of tokens to be generated
   auto total_tokens = cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
   // split each string into an array of index-pair values
-  rmm::device_vector<string_index_pair> tokens(total_tokens);
+  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   reader.d_token_offsets = d_offsets;
-  reader.d_tokens        = tokens.data().get();
+  reader.d_tokens        = tokens.data();
   thrust::for_each_n(
     rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, reader);
   // convert the index-pairs into one big strings column