diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 96e24efac8a..5e7862f4b3b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -276,8 +276,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c
 ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp text/normalize.cpp
-  text/replace.cpp text/tokenize.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp
new file mode 100644
index 00000000000..8a8bd9ae586
--- /dev/null
+++ b/cpp/benchmarks/text/edit_distance.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+/**
+ * @brief Benchmark nvtext::edit_distance on a pair of random strings columns
+ *
+ * Builds a two-column STRING table of `num_rows` rows from a profile created with
+ * distribution_id::NORMAL and the arguments 0 and `row_width`, then measures
+ * nvtext::edit_distance over the two columns.
+ *
+ * @param state nvbench state providing the `num_rows` and `row_width` axis values
+ */
+static void bench_edit_distance(nvbench::state& state)
+{
+  auto const num_rows  = static_cast(state.get_int64("num_rows"));
+  auto const row_width = static_cast(state.get_int64("row_width"));
+
+  if (static_cast(num_rows) * static_cast(row_width) >=
+      static_cast(std::numeric_limits::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+    // stop here so no input table is generated for a skipped configuration
+    return;
+  }
+
+  data_profile const strings_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const strings_table = create_random_table(
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
+  cudf::strings_column_view input1(strings_table->view().column(0));
+  cudf::strings_column_view input2(strings_table->view().column(1));
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  auto chars_size = input1.chars_size() + input2.chars_size();
+  state.add_global_memory_reads(chars_size);
+  // output are integers (one per row)
+  state.add_global_memory_writes(num_rows);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); });
+}
+
+NVBENCH_BENCH(bench_edit_distance)
+  .set_name("edit_distance")
+  .add_int64_axis("num_rows", {1024, 4096, 8192, 16384, 32768, 262144})
+  .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256});
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index fb0ecdb7677..1460be4fcf5 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,11 +29,13 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -42,91 +44,76 @@ namespace detail {
 namespace {
 
 /**
- * @brief Compute the edit-distance between two strings
+ * @brief Compute the Levenshtein (edit) distance between two strings
  *
- * The temporary buffer must be able to hold 3 int16 values for each character
- * in the smaller of the two provided strings.
+ * Documentation here: https://www.cuelogic.com/blog/the-levenshtein-algorithm
+ * And here: https://en.wikipedia.org/wiki/Levenshtein_distance
  *
  * @param d_str First string
  * @param d_tgt Second string
- * @param buffer Temporary memory buffer used for the calculation.
- * @return Edit distance value
+ * @param buffer Working buffer; holds two rows of (shorter length + 1) values
+ * @return The edit distance value
  */
-__device__ int32_t compute_distance(cudf::string_view const& d_str,
-                                    cudf::string_view const& d_tgt,
-                                    int16_t* buffer)
+__device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
+                                            cudf::string_view const& d_tgt,
+                                            cudf::size_type* buffer)
 {
   auto const str_length = d_str.length();
   auto const tgt_length = d_tgt.length();
   if (str_length == 0) return tgt_length;
   if (tgt_length == 0) return str_length;
 
-  auto itr_A = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
-  auto itr_B = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
+  auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();  // start of the shorter string
+  auto itr   = str_length < tgt_length ? d_tgt.begin() : d_str.begin();  // walks the longer string
   // .first is min and .second is max
-  auto const lengths = std::minmax(str_length, tgt_length);
+  auto const [n, m] = std::minmax(str_length, tgt_length);  // n = shorter length, m = longer length
   // setup compute buffer pointers
-  auto line2 = buffer;
-  auto line1 = line2 + lengths.first;
-  auto line0 = line1 + lengths.first;
-  // range is both lengths
-  auto const range = lengths.first + lengths.second - 1;
-  for (cudf::size_type i = 0; i < range; ++i) {
-    auto tmp = line2;
-    line2    = line1;
-    line1    = line0;
-    line0    = tmp;
-    // checking pairs of characters
-    for (int x = (i < lengths.second ? 0 : i - lengths.second + 1);
-         (x < lengths.first) && (x < i + 1);
-         ++x) {
-      int const y = i - x;
-      itr_A += (x - itr_A.position());  // point to next
-      itr_B += (y - itr_B.position());  // characters to check
-      int16_t const w =
-        (((x > 0) && (y > 0)) ? line2[x - 1] : static_cast(std::max(x, y))) +
-        static_cast(*itr_A != *itr_B);  // add 1 if characters do not match
-      int16_t const u = (y > 0 ? line1[x] : x + 1) + 1;
-      int16_t const v = (x > 0 ? line1[x - 1] : y + 1) + 1;
-      // store min(u,v,w)
-      line0[x] = std::min(std::min(u, v), w);
+  auto v0 = buffer;      // previous row: n + 1 values
+  auto v1 = v0 + n + 1;  // current row: the next n + 1 values
+  // initialize v0 to 0, 1, ..., n
+  thrust::sequence(thrust::seq, v0, v1);
+
+  for (int i = 0; i < m; ++i, ++itr) {
+    auto itr_tgt = begin;  // restart at the first character of the shorter string
+    v1[0]        = i + 1;  // first entry of the current row
+    for (int j = 0; j < n; ++j, ++itr_tgt) {
+      auto sub_cost = v0[j] + (*itr != *itr_tgt);  // adds 1 only when the characters differ
+      auto del_cost = v0[j + 1] + 1;
+      auto ins_cost = v1[j] + 1;
+      v1[j + 1]     = std::min(std::min(sub_cost, del_cost), ins_cost);
     }
+    thrust::swap(v0, v1);  // the row just computed becomes the previous row
   }
-  return static_cast(line0[lengths.first - 1]);
+  return v0[n];  // after the final swap v0 is the last computed row
 }
 
-/**
- * @brief Compute the Levenshtein distance for each string.
- *
- * Documentation here: https://www.cuelogic.com/blog/the-levenshtein-algorithm
- * And here: https://en.wikipedia.org/wiki/Levenshtein_distances
- */
 struct edit_distance_levenshtein_algorithm {
   cudf::column_device_view d_strings;  // computing these
   cudf::column_device_view d_targets;  // against these;
-  int16_t* d_buffer;                   // compute buffer for each string
-  int32_t* d_results;                  // input is buffer offset; output is edit distance
+  cudf::size_type* d_buffer;           // compute buffer for each string
+  std::ptrdiff_t const* d_offsets;     // locate sub-buffer for each string
+  cudf::size_type* d_results;          // edit distance values
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     auto d_str =
       d_strings.is_null(idx) ? cudf::string_view{} : d_strings.element(idx);
     auto d_tgt = [&] __device__ {  // d_targets is also allowed to have only one entry
-      if (d_targets.is_null(idx)) return cudf::string_view{};
+      if (d_targets.is_null(idx)) { return cudf::string_view{}; }
       return d_targets.size() == 1 ? d_targets.element(0)
                                    : d_targets.element(idx);
     }();
-    d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_results[idx]);
+    d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_offsets[idx]);
   }
 };
 
 struct edit_distance_matrix_levenshtein_algorithm {
   cudf::column_device_view d_strings;  // computing these against itself
-  int16_t* d_buffer;                   // compute buffer for each string
-  int32_t const* d_offsets;            // locate sub-buffer for each string
-  int32_t* d_results;                  // edit distance values
+  cudf::size_type* d_buffer;           // compute buffer for each string
+  std::ptrdiff_t const* d_offsets;     // locate sub-buffer for each string
+  cudf::size_type* d_results;          // edit distance values
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     auto const strings_count = d_strings.size();
     auto const row           = idx / strings_count;
@@ -136,9 +123,9 @@ struct edit_distance_matrix_levenshtein_algorithm {
       d_strings.is_null(row) ? cudf::string_view{} : d_strings.element(row);
     cudf::string_view d_str2 =
       d_strings.is_null(col) ? cudf::string_view{} : d_strings.element(col);
-    auto work_buffer = d_buffer + d_offsets[idx - ((row + 1) * (row + 2)) / 2];
-    int32_t const distance = (row == col) ? 0 : compute_distance(d_str1, d_str2, work_buffer);
-    d_results[idx]         = distance;  // top half of matrix
+    auto work_buffer    = d_buffer + d_offsets[idx - ((row + 1) * (row + 2)) / 2];
+    auto const distance = (row == col) ? 0 : compute_distance(d_str1, d_str2, work_buffer);
+    d_results[idx]      = distance;  // top half of matrix
     d_results[col * strings_count + row] = distance;  // bottom half of matrix
   }
 };
@@ -153,10 +140,13 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
 {
-  cudf::size_type strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
-  if (targets.size() > 1)
+  auto const strings_count = strings.size();
+  if (strings_count == 0) {
+    return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()});
+  }
+  if (targets.size() > 1) {
     CUDF_EXPECTS(strings_count == targets.size(), "targets.size() must equal strings.size()");
+  }
 
   // create device columns from the input columns
   auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
@@ -165,46 +155,46 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str
   auto d_targets      = *targets_column;
 
   // calculate the size of the compute-buffer;
-  // we can use the output column buffer to hold the size/offset values temporarily
-  auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
-                                               strings_count,
-                                               rmm::device_buffer{0, stream, mr},
-                                               0,
-                                               stream,
-                                               mr);
-  auto d_results = results->mutable_view().data();
-
+  rmm::device_uvector offsets(strings_count, stream);
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator(0),
                     thrust::make_counting_iterator(strings_count),
-                    d_results,
+                    offsets.begin(),
                     [d_strings, d_targets] __device__(auto idx) {
-                      if (d_strings.is_null(idx) || d_targets.is_null(idx)) return int32_t{0};
+                      if (d_strings.is_null(idx) || d_targets.is_null(idx)) {
+                        return cudf::size_type{0};
+                      }
                       auto d_str = d_strings.element(idx);
                       auto d_tgt = d_targets.size() == 1 ? d_targets.element(0)
                                                          : d_targets.element(idx);
-                      // just need 3 int16's for each character of the shorter string
-                      return static_cast(std::min(d_str.length(), d_tgt.length()) * 3);
+                      // need 2 rows of (shorter string length + 1) integers
+                      return (std::min(d_str.length(), d_tgt.length()) + 1) * 2;
                     });
   // get the total size of the temporary compute buffer
-  size_t compute_size =
-    thrust::reduce(rmm::exec_policy(stream), d_results, d_results + strings_count, size_t{0});
+  int64_t compute_size =
+    thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), int64_t{0});
   // convert sizes to offsets in-place
-  thrust::exclusive_scan(rmm::exec_policy(stream), d_results, d_results + strings_count, d_results);
+  thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
   // create the temporary compute buffer
-  rmm::device_uvector compute_buffer(compute_size, stream);
+  rmm::device_uvector compute_buffer(compute_size, stream);
   auto d_buffer = compute_buffer.data();
 
-  // compute the edit distance into the output column in-place
-  // - on input, d_results is the offset to the working section of d_buffer for each row
-  // - on output, d_results is the calculated edit distance for that row
+  auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()},
+                                               strings_count,
+                                               rmm::device_buffer{0, stream, mr},
+                                               0,
+                                               stream,
+                                               mr);
+  auto d_results = results->mutable_view().data();
+
+  // compute the edit distance into the output column
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
     strings_count,
-    edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, d_results});
+    edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, offsets.data(), d_results});
   return results;
 }
 
@@ -216,7 +206,9 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
                                                    rmm::mr::device_memory_resource* mr)
 {
   cudf::size_type strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+  if (strings_count == 0) {
+    return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()});
+  }
   CUDF_EXPECTS(strings_count > 1, "the input strings must include at least 2 strings");
   CUDF_EXPECTS(static_cast(strings_count) * static_cast(strings_count) <
                  static_cast(std::numeric_limits().max()),
@@ -230,7 +222,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
   // We only need memory for half the size of the output matrix since the edit distance calculation
   // is commutative -- `distance(strings[i],strings[j]) == distance(strings[j],strings[i])`
   cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2;
-  rmm::device_uvector offsets(n_upper, stream);
+  rmm::device_uvector offsets(n_upper, stream);
   auto d_offsets = offsets.data();
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(std::ptrdiff_t), stream.value()));
   thrust::for_each_n(
@@ -245,28 +237,29 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
         d_strings.is_null(row) ? cudf::string_view{} : d_strings.element(row);
       cudf::string_view const d_str2 =
         d_strings.is_null(col) ? cudf::string_view{} : d_strings.element(col);
-      if (d_str1.empty() || d_str2.empty()) return;
-      // the temp size needed is 3 int16s per character of the shorter string
-      d_offsets[idx - ((row + 1) * (row + 2)) / 2] = std::min(d_str1.length(), d_str2.length()) * 3;
+      if (d_str1.empty() || d_str2.empty()) { return; }
+      // the temp size needed is 2 rows of (shorter string length + 1) integers
+      d_offsets[idx - ((row + 1) * (row + 2)) / 2] =
+        (std::min(d_str1.length(), d_str2.length()) + 1) * 2;
     });
 
   // get the total size for the compute buffer
-  size_t compute_size =
-    thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), size_t{0});
+  int64_t compute_size =
+    thrust::reduce(rmm::exec_policy(stream), offsets.begin(), offsets.end(), int64_t{0});
   // convert sizes to offsets in-place
   thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
   // create the compute buffer
-  rmm::device_uvector compute_buffer(compute_size, stream);
+  rmm::device_uvector compute_buffer(compute_size, stream);
   auto d_buffer = compute_buffer.data();
 
   // compute the edit distance into the output column
-  auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
+  auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()},
                                                strings_count * strings_count,
                                                rmm::device_buffer{0, stream, mr},
                                                0,
                                                stream,
                                                mr);
-  auto d_results = results->mutable_view().data();
+  auto d_results = results->mutable_view().data();
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
@@ -274,20 +267,21 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
     edit_distance_matrix_levenshtein_algorithm{d_strings, d_buffer, d_offsets, d_results});
 
   // build a lists column of the results
-  auto offsets_column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
-                                                      strings_count + 1,
-                                                      rmm::device_buffer{0, stream, mr},
-                                                      0,
-                                                      stream,
-                                                      mr);
+  auto offsets_column =
+    cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()},
+                                  strings_count + 1,
+                                  rmm::device_buffer{0, stream, mr},
+                                  0,
+                                  stream,
+                                  mr);
   thrust::transform_exclusive_scan(
     rmm::exec_policy(stream),
-    thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(strings_count + 1),
-    offsets_column->mutable_view().data(),
+    thrust::counting_iterator(0),
+    thrust::counting_iterator(strings_count + 1),
+    offsets_column->mutable_view().data(),
     [strings_count] __device__(auto idx) { return strings_count; },
-    int32_t{0},
-    thrust::plus());
+    cudf::size_type{0},
+    thrust::plus());
   return cudf::make_lists_column(strings_count,
                                  std::move(offsets_column),
                                  std::move(results),