From 35885c57200922da477eda602aa500f96d7073fa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Mar 2023 16:54:37 -0400 Subject: [PATCH 01/33] Add nvtext::minhash function --- cpp/CMakeLists.txt | 1 + cpp/include/nvtext/minhash.hpp | 52 ++++++++++++++++ cpp/src/text/minhash.cu | 104 +++++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/minhash_tests.cpp | 56 +++++++++++++++++ 5 files changed, 214 insertions(+) create mode 100644 cpp/include/nvtext/minhash.hpp create mode 100644 cpp/src/text/minhash.cu create mode 100644 cpp/tests/text/minhash_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12b812d0bbe..f5f6fc5a656 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -577,6 +577,7 @@ add_library( src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu + src/text/minhash.cu src/text/ngrams_tokenize.cu src/text/normalize.cu src/text/replace.cu diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp new file mode 100644 index 00000000000..d5ab85358ac --- /dev/null +++ b/cpp/include/nvtext/minhash.hpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace nvtext { +/** + * @addtogroup nvtext_minhash + * @{ + * @file + */ + +/** + * @brief Returns the minhash value for each string + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string. + * + * All null row entries are ignored and the output contains all valid rows. + * + * @param input Strings column to compute minhash + * @param width The character width used for apply substrings; + * Any string smaller than this width will not be hashed. + * Default is 4 characters. + * @param seed Seed value used for the Murmur32_3 hash algorithm + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Minhash values for each string in input + */ +std::unique_ptr minhash( + cudf::strings_column_view const& input, + cudf::size_type width = 4, + cudf::hash_value_type = cudf::DEFAULT_HASH_SEED, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu new file mode 100644 index 00000000000..fc63c4a99d2 --- /dev/null +++ b/cpp/src/text/minhash.cu @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +struct minhash_fn { + cudf::column_device_view d_strings; + cudf::size_type width; + cudf::hash_value_type seed; + + __device__ cudf::hash_value_type operator()(cudf::size_type idx) const + { + if (d_strings.is_null(idx)) return 0; + auto const d_str = d_strings.element(idx); + + auto mh = cudf::hash_value_type{0}; + for (cudf::size_type pos = 0; pos < d_str.length() - (width - 1); ++pos) { + auto const ss = d_str.substr(pos, width); + auto const hasher = cudf::detail::MurmurHash3_32{seed}; + auto const hvalue = hasher(ss); + // cudf::detail::hash_combine(seed, hasher(ss)); matches cudf::hash() result + + mh = mh > 0 ? cudf::detail::min(hvalue, mh) : hvalue; + } + + return mh; + } +}; + +} // namespace + +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::size_type width, + cudf::hash_value_type seed, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(width > 1, "Parameter width should be an integer value of 2 or greater"); + + auto output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = + cudf::make_numeric_column(output_type, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); + auto d_hashes = hashes->mutable_view().data(); + + auto const itr = thrust::make_counting_iterator(0); + auto const fn = minhash_fn{*d_strings, width, seed}; + thrust::transform(rmm::exec_policy(stream), itr, itr + input.size(), d_hashes, fn); + + hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + + return hashes; +} + +} // namespace detail + +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::size_type width, + cudf::hash_value_type seed, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, width, seed, cudf::get_default_stream(), mr); +} + +} // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 0d58b19de6a..a8c9c41d684 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -444,6 +444,7 @@ ConfigureTest( TEXT_TEST text/bpe_tests.cpp text/edit_distance_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp new file mode 100644 index 00000000000..adc67a7b57e --- /dev/null +++ b/cpp/tests/text/minhash_tests.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +struct MinHashTest : public cudf::test::BaseFixture { +}; + +TEST_F(MinHashTest, Basic) +{ + auto input = cudf::test::strings_column_wrapper({"doc 1", "", "this is doc 2", "", "doc 3"}, + {1, 0, 1, 1, 1}); + + auto view = cudf::strings_column_view(input); + + auto results = nvtext::minhash(view); + + auto expected = cudf::test::fixed_width_column_wrapper( + {1207251914u, 0u, 21141582u, 0u, 1207251914u}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(MinHashTest, EmptyTest) +{ + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto results = nvtext::minhash(view); + EXPECT_EQ(results->size(), 0); +} + +TEST_F(MinHashTest, ErrorsTest) +{ + auto input = cudf::test::strings_column_wrapper({"pup"}); + EXPECT_THROW(nvtext::minhash(cudf::strings_column_view(input), 0), cudf::logic_error); +} From a0143676c6665798afe25434cb176b48c50e3bbd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Mar 2023 16:57:13 -0400 Subject: [PATCH 02/33] fix missing parameter name in function declaration --- cpp/include/nvtext/minhash.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index d5ab85358ac..15f5c240845 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -45,7 +45,7 @@ namespace nvtext { std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::size_type width = 4, - cudf::hash_value_type = cudf::DEFAULT_HASH_SEED, + cudf::hash_value_type seed = cudf::DEFAULT_HASH_SEED, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group From a6b40f44828d6ed73f8f5cc65397de1451a07153 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 17 Mar 2023 08:22:34 -0400 Subject: [PATCH 03/33] fix typo in doxygen comment --- cpp/include/nvtext/minhash.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 15f5c240845..61845661ef1 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -38,7 +38,7 @@ namespace nvtext { * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. - * @param seed Seed value used for the Murmur32_3 hash algorithm + * @param seed Seed value used for the MurmurHash3_32 algorithm * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ From 5226d7f00b9722f2f9ae401c4304f49b02d21456 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 17 Mar 2023 17:23:23 -0400 Subject: [PATCH 04/33] add cython/python interface to nvtext::minhash --- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 17 +++++++++ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 +-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 37 ++++++++++++++++++++ python/cudf/cudf/_lib/strings/__init__.py | 3 +- python/cudf/cudf/core/column/string.py | 26 ++++++++++++++ 5 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd create mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd new file mode 100644 index 00000000000..40cffca784a --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] minhash( + const column_view &strings, + size_type ngrams, + uint32_t seed + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 3b925fb5548..743070b4491 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx ngrams_tokenize.pyx normalize.pyx +set(cython_sources edit_distance.pyx generate_ngrams.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx new file mode 100644 index 00000000000..82be66c6a16 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf.core.buffer import acquire_spill_lock + +from libc.stdint cimport uint32_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.minhash cimport ( + minhash as cpp_minhash +) +from cudf._lib.cpp.types cimport size_type + + +@acquire_spill_lock() +def minhash(Column strings, int width, int seed=0): + + cdef column_view c_strings = strings.view() + cdef size_type c_width = width + cdef uint32_t c_seed = seed + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_minhash( + c_strings, + c_width, + c_seed + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 22a5066a20e..84d538d8e55 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, generate_ngrams, ) +from cudf._lib.nvtext.minhash import minhash from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d9a6c6c4cd6..3e7cc479b47 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5226,6 +5226,32 @@ def edit_distance_matrix(self) -> SeriesOrIndex: libstrings.edit_distance_matrix(self._column) ) + def minhash(self, n: int = 4, seed: int = 0) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + Parameters + ---------- + n : int + The width of the substring to hash. + Default of 4 characters. + seed : int + The seed used for the hash algorithm. + Default is 0. + + Examples + -------- + >>> import cudf + >>> str_series = cudf.Series(['this is my', 'favorite book']) + >>> str_series.str.minhash() + 0 2012639418 + 1 182731933 + dtype: int32 + """ + return self._return_or_inplace( + libstrings.minhash(self._column, n, seed) + ) + def _massage_string_arg(value, name, allow_col=False): if isinstance(value, str): From 21af84736bc135475fadfc4e07f4b2217f18504a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 17 Mar 2023 17:25:49 -0400 Subject: [PATCH 05/33] fix style violations --- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 ++-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 743070b4491..1b6ad5110c4 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx +set(cython_sources edit_distance.pyx generate_ngrams.pyx minhash.pyx ngrams_tokenize.pyx + normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 82be66c6a16..ae56724afb1 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -9,9 +9,7 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.nvtext.minhash cimport ( - minhash as cpp_minhash -) +from cudf._lib.cpp.nvtext.minhash cimport minhash as cpp_minhash from cudf._lib.cpp.types cimport size_type @@ -33,5 +31,3 @@ def minhash(Column strings, int width, int seed=0): ) return Column.from_unique_ptr(move(c_result)) - - From 932781344ae595c3dd8fb1ef2f98e62205988315 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 20 Mar 2023 14:46:47 -0400 Subject: [PATCH 06/33] add benchmark --- cpp/benchmarks/CMakeLists.txt | 5 ++- cpp/benchmarks/text/minhash.cpp | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/text/minhash.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cc0b642a337..5f92cb3216e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -269,6 +269,10 @@ ConfigureBench( text/subword.cpp text/tokenize.cpp ) +ConfigureNVBench( + TEXT_NVBENCH text/minhash.cpp +) + # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( @@ -285,7 +289,6 @@ ConfigureBench( string/factory.cu string/filter.cpp string/find.cpp - string/like.cpp string/repeat_strings.cpp string/replace.cpp string/replace_re.cpp diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp new file mode 100644 index 00000000000..6a215d85785 --- /dev/null +++ b/cpp/benchmarks/text/minhash.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include + +static void bench_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hash_width = static_cast(state.get_int64("hash_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(num_rows); // all bytes are written + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::minhash(input, hash_width); }); +} + +NVBENCH_BENCH(bench_minhash) + .set_name("minhash") + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {8, 16, 32, 64, 128}) + .add_int64_axis("hash_width", {5, 10, 25, 70, 100}); From d441b0d142731b0a8a94f8d85771e1740db5676e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 20 Mar 2023 14:47:51 -0400 Subject: [PATCH 07/33] fix style violation --- cpp/benchmarks/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5f92cb3216e..3bbb7464eaa 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -269,9 +269,7 @@ ConfigureBench( text/subword.cpp text/tokenize.cpp ) -ConfigureNVBench( - TEXT_NVBENCH text/minhash.cpp -) +ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- From 8d206fa3fed22150587bba58b384b6525bc4228e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 20 Mar 2023 19:34:58 -0400 Subject: [PATCH 08/33] add doxygen group --- cpp/include/doxygen_groups.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 628d48f64cd..a31e7fc7165 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -164,6 +164,7 @@ * @defgroup nvtext_edit_distance Edit Distance * @defgroup nvtext_tokenize Tokenizing * @defgroup nvtext_replace Replacing + * @defgroup nvtext_minhash MinHashing * @} * @defgroup utility_apis Utilities * @{ From d7d947a91b1eb9f43c635f78fa800b7946888e4b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 Mar 2023 17:56:43 -0400 Subject: [PATCH 09/33] fix long strings issue --- cpp/benchmarks/text/minhash.cpp | 6 +++--- cpp/include/cudf/strings/string_view.cuh | 11 +++++++++-- cpp/include/cudf/strings/string_view.hpp | 4 +++- cpp/src/text/minhash.cu | 17 ++++++++++++----- cpp/tests/text/minhash_tests.cpp | 6 +++--- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 6a215d85785..70d4570b0ca 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -52,6 +52,6 @@ static void bench_minhash(nvbench::state& state) NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("hash_width", {5, 10, 25, 70, 100}); + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {128, 256, 512, 1024, 2048}) + .add_int64_axis("hash_width", {5, 10, 25}); diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 29062167f11..a64b9fabb59 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,6 +121,13 @@ __device__ inline string_view::const_iterator::const_iterator(const string_view& { } +__device__ inline string_view::const_iterator::const_iterator(const string_view& str, + size_type pos, + size_type offset) + : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{offset} +{ +} + __device__ inline string_view::const_iterator& string_view::const_iterator::operator++() { if (byte_pos < bytes) @@ -244,7 +251,7 @@ __device__ inline string_view::const_iterator string_view::begin() const __device__ inline string_view::const_iterator string_view::end() const { - return const_iterator(*this, length()); + return const_iterator(*this, length(), size_bytes()); } // @endcond diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 265adc60392..8f7ad74ae62 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,10 +104,12 @@ class string_view { [[nodiscard]] __device__ inline size_type byte_offset() const; private: + friend class string_view; const char* p{}; size_type bytes{}; size_type char_pos{}; size_type byte_pos{}; + __device__ inline const_iterator(const string_view& str, size_type pos, size_type offset); /// @endcond }; diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index fc63c4a99d2..95c8678cee8 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -46,14 +46,21 @@ struct minhash_fn { __device__ cudf::hash_value_type operator()(cudf::size_type idx) const { if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); + auto const hasher = cudf::detail::MurmurHash3_32{seed}; + + if (d_str.length() <= width) return hasher(d_str); + + auto const begin = d_str.begin(); + auto const end = d_str.end() - (width - 1); auto mh = cudf::hash_value_type{0}; - for (cudf::size_type pos = 0; pos < d_str.length() - (width - 1); ++pos) { - auto const ss = d_str.substr(pos, width); - auto const hasher = cudf::detail::MurmurHash3_32{seed}; + for (auto itr = begin; itr < end; ++itr) { + auto const offset = itr.byte_offset(); + auto const ss = + cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); matches cudf::hash() result + // cudf::detail::hash_combine(seed, hasher(ss)); -- matches cudf::hash() result mh = mh > 0 ? cudf::detail::min(hvalue, mh) : hvalue; } diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index adc67a7b57e..dd4cce4a473 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -29,15 +29,15 @@ struct MinHashTest : public cudf::test::BaseFixture { TEST_F(MinHashTest, Basic) { - auto input = cudf::test::strings_column_wrapper({"doc 1", "", "this is doc 2", "", "doc 3"}, - {1, 0, 1, 1, 1}); + auto input = cudf::test::strings_column_wrapper({"doc 1", "", "this is doc 2", "", "doc 3", "d"}, + {1, 0, 1, 1, 1, 1}); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view); auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u}, {1, 0, 1, 1, 1}); + {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u}, {1, 0, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } From 9e66eec79d8900e810921684e39655830e7061dd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 23 Mar 2023 14:53:52 -0400 Subject: [PATCH 10/33] rework as warp parallel kernel --- cpp/src/text/minhash.cu | 70 ++++++++++++++++++++------------ cpp/tests/text/minhash_tests.cpp | 13 ++++-- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 95c8678cee8..91e1f9846d3 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -34,40 +35,54 @@ #include #include +#include + +#include + namespace nvtext { namespace detail { namespace { -struct minhash_fn { - cudf::column_device_view d_strings; - cudf::size_type width; - cudf::hash_value_type seed; - - __device__ cudf::hash_value_type operator()(cudf::size_type idx) const - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - auto const hasher = cudf::detail::MurmurHash3_32{seed}; +__global__ void minhash_fn(cudf::column_device_view d_strings, + cudf::size_type width, + cudf::hash_value_type seed, + cudf::hash_value_type* d_hashes) +{ + cudf::size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - if (d_str.length() <= width) return hasher(d_str); + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - auto const begin = d_str.begin(); - auto const end = d_str.end() - (width - 1); + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + if (d_strings.is_null(str_idx)) { + d_hashes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_hashes[str_idx] = 0; + return; + } - auto mh = cudf::hash_value_type{0}; - for (auto itr = begin; itr < end; ++itr) { - auto const offset = itr.byte_offset(); - auto const ss = - cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); - auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); -- matches cudf::hash() result + auto const hasher = cudf::detail::MurmurHash3_32{seed}; + auto const begin = d_str.begin() + lane_idx; + auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); - mh = mh > 0 ? cudf::detail::min(hvalue, mh) : hvalue; - } + auto mh = std::numeric_limits::max(); + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + auto const offset = itr.byte_offset(); + auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); + auto const hvalue = hasher(ss); + // cudf::detail::hash_combine(seed, hasher(ss)); -- matches cudf::hash() result - return mh; + mh = cudf::detail::min(hvalue, mh); } -}; + + auto const mhash = warp_reduce(temp_storage).Reduce(mh, thrust::minimum{}); + if (lane_idx == 0) { d_hashes[str_idx] = mhash; } +} } // namespace @@ -88,9 +103,10 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::make_numeric_column(output_type, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); auto d_hashes = hashes->mutable_view().data(); - auto const itr = thrust::make_counting_iterator(0); - auto const fn = minhash_fn{*d_strings, width, seed}; - thrust::transform(rmm::exec_policy(stream), itr, itr + input.size(), d_hashes, fn); + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + minhash_fn<<>>( + *d_strings, width, seed, d_hashes); hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index dd4cce4a473..5e217efb2de 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -29,15 +29,22 @@ struct MinHashTest : public cudf::test::BaseFixture { TEST_F(MinHashTest, Basic) { - auto input = cudf::test::strings_column_wrapper({"doc 1", "", "this is doc 2", "", "doc 3", "d"}, - {1, 0, 1, 1, 1, 1}); + auto input = + cudf::test::strings_column_wrapper({"doc 1", + "", + "this is doc 2", + "", + "doc 3", + "d", + "The quick brown fox jumpéd over the lazy brown dog."}, + {1, 0, 1, 1, 1, 1, 1}); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view); auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u}, {1, 0, 1, 1, 1, 1}); + {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, {1, 0, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } From c701df802c251d9d9daacebc3a6c7d9053bd5dc7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Mar 2023 06:48:48 -0400 Subject: [PATCH 11/33] add multi-seed libcudf API --- cpp/include/nvtext/minhash.hpp | 26 ++++++- cpp/src/text/minhash.cu | 96 ++++++++++++++++-------- cpp/tests/text/minhash_tests.cpp | 44 +++++++++-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 4 +- 4 files changed, 131 insertions(+), 39 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 61845661ef1..e73de09bde7 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,7 +17,9 @@ #include #include +#include #include +#include namespace nvtext { /** @@ -44,8 +46,30 @@ namespace nvtext { */ std::unique_ptr minhash( cudf::strings_column_view const& input, + cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), + cudf::size_type width = 4, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the minhash values for each string per seed + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string. + * + * All null row entries are ignored and the output contains all valid rows. + * + * @param input Strings column to compute minhash + * @param seeds Seed values used for the MurmurHash3_32 algorithm + * @param width The character width used for apply substrings; + * Any string smaller than this width will not be hashed. + * Default is 4 characters. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of Minhash values for each string per seed + */ +std::unique_ptr minhash( + cudf::strings_column_view const& input, + cudf::device_span seeds, cudf::size_type width = 4, - cudf::hash_value_type seed = cudf::DEFAULT_HASH_SEED, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 91e1f9846d3..731496585d0 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -44,8 +45,8 @@ namespace detail { namespace { __global__ void minhash_fn(cudf::column_device_view d_strings, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_value_type seed, cudf::hash_value_type* d_hashes) { cudf::size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -56,39 +57,42 @@ __global__ void minhash_fn(cudf::column_device_view d_strings, auto const str_idx = idx / cudf::detail::warp_size; auto const lane_idx = idx % cudf::detail::warp_size; - if (d_strings.is_null(str_idx)) { - d_hashes[str_idx] = 0; - return; - } + + if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); - if (d_str.empty()) { - d_hashes[str_idx] = 0; - return; - } - auto const hasher = cudf::detail::MurmurHash3_32{seed}; - auto const begin = d_str.begin() + lane_idx; - auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const output_idx = str_idx * seeds.size() + seed_idx; - auto mh = std::numeric_limits::max(); - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - auto const offset = itr.byte_offset(); - auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); - auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); -- matches cudf::hash() result + auto const seed = seeds[seed_idx]; + auto const hasher = cudf::detail::MurmurHash3_32{seed}; - mh = cudf::detail::min(hvalue, mh); - } + auto const begin = d_str.begin() + lane_idx; + auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); + + auto mh = d_str.empty() ? 0 : std::numeric_limits::max(); + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + auto const offset = itr.byte_offset(); + auto const ss = + cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); - auto const mhash = warp_reduce(temp_storage).Reduce(mh, thrust::minimum{}); - if (lane_idx == 0) { d_hashes[str_idx] = mhash; } + auto const hvalue = hasher(ss); + // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result + + mh = cudf::detail::min(hvalue, mh); + } + + auto const mhash = + warp_reduce(temp_storage).Reduce(mh, thrust::minimum{}); + if (lane_idx == 0) { d_hashes[output_idx] = mhash; } + } } } // namespace std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_value_type seed, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -99,29 +103,59 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto hashes = - cudf::make_numeric_column(output_type, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); auto d_hashes = hashes->mutable_view().data(); constexpr int block_size = 256; cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; minhash_fn<<>>( - *d_strings, width, seed, d_hashes); - - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + *d_strings, seeds, width, d_hashes); - return hashes; + if (seeds.size() == 1) { + hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count()); + return hashes; + } + hashes->set_null_count(0); + + auto offsets = cudf::detail::sequence( + input.size() + 1, + cudf::numeric_scalar(0), + cudf::numeric_scalar(static_cast(seeds.size())), + stream, + mr); + return make_lists_column(input.size(), + std::move(offsets), + std::move(hashes), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); } } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + auto seeds = cudf::device_span{seed.data(), 1}; + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); +} + +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_value_type seed, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, width, seed, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); } } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 5e217efb2de..b0a85373a08 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -14,14 +14,20 @@ * limitations under the License. */ -#include -#include -#include - #include #include #include +#include + +#include +#include +#include +#include + +#include +#include + #include struct MinHashTest : public cudf::test::BaseFixture { @@ -48,6 +54,34 @@ TEST_F(MinHashTest, Basic) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(MinHashTest, MultiSeed) +{ + auto input = + cudf::test::strings_column_wrapper({"doc 1", + "this is doc 2", + "doc 3", + "d", + "The quick brown fox jumpéd over the lazy brown dog."}); + + auto view = cudf::strings_column_view(input); + + auto const seeds = std::vector{0, 1, 2}; + auto const d_seeds = cudf::detail::make_device_uvector_async( + seeds, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + + auto results = nvtext::minhash(view, d_seeds); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, + LCW{ 21141582u, 580916568u, 1258052021u}, + LCW{1207251914u, 943567174u, 1109272887u}, + LCW{ 655955059u, 488346356u, 2394664816u}, + LCW{ 86520422u, 236622901u, 102546228u}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -59,5 +93,5 @@ TEST_F(MinHashTest, EmptyTest) TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"pup"}); - EXPECT_THROW(nvtext::minhash(cudf::strings_column_view(input), 0), cudf::logic_error); + EXPECT_THROW(nvtext::minhash(cudf::strings_column_view(input), 0, 0), cudf::logic_error); } diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index ae56724afb1..af9e9700e3c 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -25,8 +25,8 @@ def minhash(Column strings, int width, int seed=0): c_result = move( cpp_minhash( c_strings, - c_width, - c_seed + c_seed, + c_width ) ) From 21db66b69c0049840f2287363d23cd2beb090348 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Mar 2023 06:49:57 -0400 Subject: [PATCH 12/33] fix benchmark call --- cpp/benchmarks/text/minhash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 70d4570b0ca..3ae465e1986 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -47,7 +47,7 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // all bytes are written state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::minhash(input, hash_width); }); + [&](nvbench::launch& launch) { auto result = nvtext::minhash(input, 0, hash_width); }); } NVBENCH_BENCH(bench_minhash) From 0e3c4e36396d723625a7b483b564428b7622eff1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Mar 2023 11:31:31 -0400 Subject: [PATCH 13/33] change cython/python to use multi-seed API --- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 4 +-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 7 +++--- python/cudf/cudf/core/column/string.py | 26 ++++++++++++-------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index 40cffca784a..d7afed81826 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -12,6 +12,6 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( const column_view &strings, - size_type ngrams, - uint32_t seed + const column_view &seeds, + size_type width ) except + diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index af9e9700e3c..dd88fd132bc 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,7 +2,6 @@ from cudf.core.buffer import acquire_spill_lock -from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -14,18 +13,18 @@ from cudf._lib.cpp.types cimport size_type @acquire_spill_lock() -def minhash(Column strings, int width, int seed=0): +def minhash(Column strings, Column seeds, int width): cdef column_view c_strings = strings.view() cdef size_type c_width = width - cdef uint32_t c_seed = seed + cdef column_view c_seeds = seeds.view() cdef unique_ptr[column] c_result with nogil: c_result = move( cpp_minhash( c_strings, - c_seed, + c_seeds, c_width ) ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 3e7cc479b47..9dc071e1a87 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5226,30 +5226,36 @@ def edit_distance_matrix(self) -> SeriesOrIndex: libstrings.edit_distance_matrix(self._column) ) - def minhash(self, n: int = 4, seed: int = 0) -> SeriesOrIndex: + def minhash(self, seeds, n: int = 4) -> SeriesOrIndex: """ Compute the minhash of a strings column. Parameters ---------- + seeds : Series + The seeds used for the hash algorithm. + Must be of type uint32. n : int The width of the substring to hash. - Default of 4 characters. - seed : int - The seed used for the hash algorithm. - Default is 0. + Default is 4 characters. Examples -------- >>> import cudf >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> str_series.str.minhash() - 0 2012639418 - 1 182731933 - dtype: int32 + >>> seeds = cudf.Series([0], dtype=np.uint32) + >>> str_series.str.minhash(seeds) + 0 21141582 + 1 962346254 + dtype: uint32 + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> str_series.str.minhash(seeds) + 0 [21141582, 403093213, 1258052021] + 1 [962346254, 677440381, 122618762] + dtype: list """ return self._return_or_inplace( - libstrings.minhash(self._column, n, seed) + libstrings.minhash(self._column, seeds._column, n) ) From d8f06f3ae98eced92baa2f1dc4e989e5ef66686c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Mar 2023 11:48:36 -0400 Subject: [PATCH 14/33] move const itr vars outside the for --- cpp/src/text/minhash.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 731496585d0..fe6ffc6b9b5 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -60,6 +60,8 @@ __global__ void minhash_fn(cudf::column_device_view d_strings, if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); + auto const begin = d_str.begin() + lane_idx; + auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { auto const output_idx = str_idx * seeds.size() + seed_idx; @@ -67,9 +69,6 @@ __global__ void minhash_fn(cudf::column_device_view d_strings, auto const seed = seeds[seed_idx]; auto const hasher = cudf::detail::MurmurHash3_32{seed}; - auto const begin = d_str.begin() + lane_idx; - auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); - auto mh = d_str.empty() ? 0 : std::numeric_limits::max(); for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); From 41c14a29b74a8678778ed36f6ba57f583d8751f1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Mar 2023 16:39:17 -0400 Subject: [PATCH 15/33] switch for-loops and use atomicMin --- cpp/src/text/minhash.cu | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index fe6ffc6b9b5..dd297f5a3e8 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ namespace nvtext { namespace detail { namespace { +#if 0 __global__ void minhash_fn(cudf::column_device_view d_strings, cudf::device_span seeds, cudf::size_type width, @@ -86,6 +88,45 @@ __global__ void minhash_fn(cudf::column_device_view d_strings, if (lane_idx == 0) { d_hashes[output_idx] = mhash; } } } +#endif + +__global__ void minhash_fn(cudf::column_device_view d_strings, + cudf::device_span seeds, + cudf::size_type width, + cudf::hash_value_type* d_hashes) +{ + cudf::size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + + if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } + + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const output_idx = str_idx * seeds.size() + seed_idx; + d_hashes[output_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); + } + auto const begin = d_str.begin() + lane_idx; + auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); + + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + auto const offset = itr.byte_offset(); + auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); + + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const output_idx = str_idx * seeds.size() + seed_idx; + + auto const seed = seeds[seed_idx]; + auto const hasher = cudf::detail::MurmurHash3_32{seed}; + + auto const hvalue = hasher(ss); + // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result + atomicMin(d_hashes + output_idx, hvalue); + } + } +} } // namespace From 28cf7127ce1e829f9fc5d847e82c4eb9d1051b7e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Mar 2023 14:16:00 -0400 Subject: [PATCH 16/33] support seeds default parameter --- python/cudf/cudf/core/column/string.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9dc071e1a87..cd6a1517788 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5226,7 +5226,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: libstrings.edit_distance_matrix(self._column) ) - def minhash(self, seeds, n: int = 4) -> SeriesOrIndex: + def minhash(self, seeds=None, n: int = 4) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5254,8 +5254,12 @@ def minhash(self, seeds, n: int = 4) -> SeriesOrIndex: 1 [962346254, 677440381, 122618762] dtype: list """ + if seeds is None: + seeds = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds = seeds._column return self._return_or_inplace( - libstrings.minhash(self._column, seeds._column, n) + libstrings.minhash(self._column, seeds, n) ) From 491342d3f8d052c19bd0663e61558dc7701479cd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Mar 2023 14:16:32 -0400 Subject: [PATCH 17/33] add multi-seed support to benchmark --- cpp/benchmarks/text/minhash.cpp | 32 +++++++--- cpp/src/text/minhash.cu | 109 +++++++++----------------------- 2 files changed, 53 insertions(+), 88 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 3ae465e1986..166ae6a05b2 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -23,35 +23,47 @@ #include +#include + static void bench_minhash(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { state.skip("Skip benchmarks greater than size_type limit"); } - data_profile const table_profile = data_profile_builder().distribution( + data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); - cudf::strings_column_view input(table->view().column(0)); + auto const strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input(strings_table->view().column(0)); + + data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); + auto const seeds_table = create_random_table( + {cudf::type_to_id()}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); auto chars_size = input.chars_size(); - state.add_global_memory_reads(chars_size); // all bytes are read; - state.add_global_memory_writes(num_rows); // all bytes are written + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // output are hashes - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::minhash(input, 0, hash_width); }); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::minhash(input, seeds.view(), hash_width); + }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 256, 512, 1024, 2048}) - .add_int64_axis("hash_width", {5, 10, 25}); + .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("hash_width", {5, 10, 25}) + .add_int64_axis("seed_count", {2, 260}); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index dd297f5a3e8..ba1b75e6e23 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -34,10 +33,8 @@ #include #include +#include #include -#include - -#include #include @@ -45,88 +42,44 @@ namespace nvtext { namespace detail { namespace { -#if 0 -__global__ void minhash_fn(cudf::column_device_view d_strings, - cudf::device_span seeds, - cudf::size_type width, - cudf::hash_value_type* d_hashes) -{ - cudf::size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; - - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } +struct minhash_fn { + cudf::column_device_view d_strings; + cudf::device_span seeds; + cudf::size_type width; + cudf::hash_value_type* d_hashes; - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + __device__ void operator()(cudf::size_type idx) + { + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; - if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); - auto const begin = d_str.begin() + lane_idx; - auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); - - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const output_idx = str_idx * seeds.size() + seed_idx; - - auto const seed = seeds[seed_idx]; - auto const hasher = cudf::detail::MurmurHash3_32{seed}; + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const output_idx = str_idx * seeds.size() + seed_idx; + d_hashes[output_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); + } + auto const begin = d_str.begin() + lane_idx; + auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); - auto mh = d_str.empty() ? 0 : std::numeric_limits::max(); for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); - auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result - - mh = cudf::detail::min(hvalue, mh); - } - - auto const mhash = - warp_reduce(temp_storage).Reduce(mh, thrust::minimum{}); - if (lane_idx == 0) { d_hashes[output_idx] = mhash; } - } -} -#endif - -__global__ void minhash_fn(cudf::column_device_view d_strings, - cudf::device_span seeds, - cudf::size_type width, - cudf::hash_value_type* d_hashes) -{ - cudf::size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - - if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const output_idx = str_idx * seeds.size() + seed_idx; - d_hashes[output_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); - } - auto const begin = d_str.begin() + lane_idx; - auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); - - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - auto const offset = itr.byte_offset(); - auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); - - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const output_idx = str_idx * seeds.size() + seed_idx; + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const output_idx = str_idx * seeds.size() + seed_idx; - auto const seed = seeds[seed_idx]; - auto const hasher = cudf::detail::MurmurHash3_32{seed}; + auto const seed = seeds[seed_idx]; + auto const hasher = cudf::detail::MurmurHash3_32{seed}; - auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result - atomicMin(d_hashes + output_idx, hvalue); + auto const hvalue = hasher(ss); + // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result + atomicMin(d_hashes + output_idx, hvalue); + } } } -} +}; } // namespace @@ -150,10 +103,10 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; - minhash_fn<<>>( - *d_strings, seeds, width, d_hashes); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::counting_iterator(0), + input.size() * cudf::detail::warp_size, + minhash_fn{*d_strings, seeds, width, d_hashes}); if (seeds.size() == 1) { hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), From 68660eceb0711fec010b82f61b7972255a9b5bd6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 28 Mar 2023 18:34:26 -0400 Subject: [PATCH 18/33] add hash function parameter --- cpp/benchmarks/text/minhash.cpp | 2 +- cpp/include/nvtext/minhash.hpp | 15 ++++++++++++++- cpp/src/text/minhash.cu | 14 +++++++++++--- cpp/tests/text/minhash_tests.cpp | 8 ++++++-- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 2 ++ python/cudf/cudf/_lib/nvtext/minhash.pyx | 9 ++++++++- python/cudf/cudf/core/column/string.py | 10 ++++++++-- 7 files changed, 50 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 166ae6a05b2..defd65c1b2a 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -57,7 +57,7 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), hash_width); + auto result = nvtext::minhash(input, seeds.view(), cudf::hash_id::HASH_MURMUR3, hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index e73de09bde7..dfd036e8083 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -36,17 +36,23 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if h_id is not HASH_MURMUR3 + * * @param input Strings column to compute minhash + * @param h_id Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. - * @param seed Seed value used for the MurmurHash3_32 algorithm + * @param seed Seed value used for the MurmurHash3_32 algorithm * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), + cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, cudf::size_type width = 4, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -58,7 +64,13 @@ std::unique_ptr minhash( * * All null row entries are ignored and the output contains all valid rows. * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if h_id is not HASH_MURMUR3 + * @throw std::invalid_argument if seeds is empty + * * @param input Strings column to compute minhash + * @param h_id Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param seeds Seed values used for the MurmurHash3_32 algorithm * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. @@ -69,6 +81,7 @@ std::unique_ptr minhash( std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, + cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, cudf::size_type width = 4, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index ba1b75e6e23..2eb6e6ab46e 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -85,11 +85,17 @@ struct minhash_fn { std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, + cudf::hash_id h_id, cudf::size_type width, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(width > 1, "Parameter width should be an integer value of 2 or greater"); + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS( + width > 1, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); + CUDF_EXPECTS(h_id == cudf::hash_id::HASH_MURMUR3, + "Only murmur3 hash algorithm supported", + std::invalid_argument); auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } @@ -134,21 +140,23 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::numeric_scalar seed, + cudf::hash_id h_id, cudf::size_type width, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto seeds = cudf::device_span{seed.data(), 1}; - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, h_id, width, cudf::get_default_stream(), mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, + cudf::hash_id h_id, cudf::size_type width, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, h_id, width, cudf::get_default_stream(), mr); } } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index b0a85373a08..c1a80854a7c 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -92,6 +92,10 @@ TEST_F(MinHashTest, EmptyTest) TEST_F(MinHashTest, ErrorsTest) { - auto input = cudf::test::strings_column_wrapper({"pup"}); - EXPECT_THROW(nvtext::minhash(cudf::strings_column_view(input), 0, 0), cudf::logic_error); + auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); + auto view = cudf::strings_column_view(input); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::hash_id::HASH_MURMUR3, 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); + auto seeds = cudf::device_span{}; + EXPECT_THROW(nvtext::minhash(view, seeds), std::invalid_argument); } diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index d7afed81826..2ece3015505 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -5,6 +5,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.hash cimport hash_id from cudf._lib.cpp.types cimport size_type @@ -13,5 +14,6 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, + const hash_id hash_function, size_type width ) except + diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index dd88fd132bc..68eda28e9c5 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -8,23 +8,30 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.hash cimport hash_id as cpp_hash_id from cudf._lib.cpp.nvtext.minhash cimport minhash as cpp_minhash from cudf._lib.cpp.types cimport size_type @acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): +def minhash(Column strings, Column seeds, str method, int width): cdef column_view c_strings = strings.view() cdef size_type c_width = width cdef column_view c_seeds = seeds.view() cdef unique_ptr[column] c_result + cdef cpp_hash_id c_hash_function + if method == "murmur3": + c_hash_function = cpp_hash_id.HASH_MURMUR3 + else: + raise ValueError(f"Unsupported hash function: {method}") with nogil: c_result = move( cpp_minhash( c_strings, c_seeds, + c_hash_function, c_width ) ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index cd6a1517788..54c9893e94a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5226,7 +5226,9 @@ def edit_distance_matrix(self) -> SeriesOrIndex: libstrings.edit_distance_matrix(self._column) ) - def minhash(self, seeds=None, n: int = 4) -> SeriesOrIndex: + def minhash( + self, seeds=None, n: int = 4, method: str = "murmur3" + ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5238,6 +5240,10 @@ def minhash(self, seeds=None, n: int = 4) -> SeriesOrIndex: n : int The width of the substring to hash. Default is 4 characters. + method : str + Hash function to use. + Only 'murmur3' (MurmurHash3_32) is supported. + Default is 'murmur3'. Examples -------- @@ -5259,7 +5265,7 @@ def minhash(self, seeds=None, n: int = 4) -> SeriesOrIndex: else: seeds = seeds._column return self._return_or_inplace( - libstrings.minhash(self._column, seeds, n) + libstrings.minhash(self._column, seeds, method, n) ) From a07a25bde6c78c8c6438451dff4eea8280ad955a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Mar 2023 13:51:14 -0400 Subject: [PATCH 19/33] fix dstr.length <= width edge case --- cpp/src/text/minhash.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 2eb6e6ab46e..306f0de2375 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -60,7 +60,11 @@ struct minhash_fn { d_hashes[output_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); } auto const begin = d_str.begin() + lane_idx; - auto const end = (d_str.length() <= width) ? d_str.end() : d_str.end() - (width - 1); + auto const end = [d_str, width = width] { + auto const length = d_str.length(); + if (length > width) { return (d_str.end() - (width - 1)); } + return d_str.begin() + static_cast(length > 0); + }(); for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); From d65ad8f598470657ee1ca086971863d12eb78c64 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Mar 2023 14:19:51 -0400 Subject: [PATCH 20/33] add more tests --- cpp/tests/text/minhash_tests.cpp | 10 ++++++++++ python/cudf/cudf/tests/test_text.py | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index c1a80854a7c..7ad27c02d50 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -54,6 +54,16 @@ TEST_F(MinHashTest, Basic) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(MinHashTest, LengthEqualsWidth) +{ + auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); + auto view = cudf::strings_column_view(input); + auto results = nvtext::minhash(view, 0, cudf::hash_id::HASH_MURMUR3, 5); + auto expected = cudf::test::fixed_width_column_wrapper( + {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(MinHashTest, MultiSeed) { auto input = diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 89c428551e4..ab460da8ee8 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -789,6 +789,13 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) +def test_minhash(): + strings = cudf.Series(["this is my", "favorite book", None, ""]) + expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) + actual = strings.str.minhash() + assert_eq(expected, actual) + + def test_read_text(datadir): chess_file = str(datadir) + "/chess.pgn" delimiter = "1." From 256d7e3b3e0dcae24818b3e7d0c8f2437aad2646 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 3 Apr 2023 16:12:38 -0400 Subject: [PATCH 21/33] move hash-id parameter to the end --- cpp/benchmarks/text/minhash.cpp | 2 +- cpp/include/nvtext/minhash.hpp | 14 +++++++------- cpp/src/text/minhash.cu | 10 +++++----- cpp/tests/text/minhash_tests.cpp | 6 +++--- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 4 ++-- python/cudf/cudf/_lib/nvtext/minhash.pyx | 6 +++--- python/cudf/cudf/core/column/string.py | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index defd65c1b2a..166ae6a05b2 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -57,7 +57,7 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), cudf::hash_id::HASH_MURMUR3, hash_width); + auto result = nvtext::minhash(input, seeds.view(), hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index dfd036e8083..315e0945164 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -40,20 +40,20 @@ namespace nvtext { * @throw std::invalid_argument if h_id is not HASH_MURMUR3 * * @param input Strings column to compute minhash - * @param h_id Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. + * @param seed Seed value used for the MurmurHash3_32 algorithm * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. - * @param seed Seed value used for the MurmurHash3_32 algorithm + * @param h_id Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), - cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, cudf::size_type width = 4, + cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -69,20 +69,20 @@ std::unique_ptr minhash( * @throw std::invalid_argument if seeds is empty * * @param input Strings column to compute minhash - * @param h_id Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. * @param seeds Seed values used for the MurmurHash3_32 algorithm * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. + * @param h_id Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of Minhash values for each string per seed */ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, - cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, cudf::size_type width = 4, + cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 306f0de2375..d8a7aa168f5 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -89,8 +89,8 @@ struct minhash_fn { std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, - cudf::hash_id h_id, cudf::size_type width, + cudf::hash_id h_id, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -144,23 +144,23 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::numeric_scalar seed, - cudf::hash_id h_id, cudf::size_type width, + cudf::hash_id h_id, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto seeds = cudf::device_span{seed.data(), 1}; - return detail::minhash(input, seeds, h_id, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, h_id, cudf::get_default_stream(), mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, - cudf::hash_id h_id, cudf::size_type width, + cudf::hash_id h_id, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, h_id, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, h_id, cudf::get_default_stream(), mr); } } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7ad27c02d50..5a2da838199 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -58,7 +58,7 @@ TEST_F(MinHashTest, LengthEqualsWidth) { auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, cudf::hash_id::HASH_MURMUR3, 5); + auto results = nvtext::minhash(view, 0, 5); auto expected = cudf::test::fixed_width_column_wrapper( {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -104,8 +104,8 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::hash_id::HASH_MURMUR3, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); auto seeds = cudf::device_span{}; EXPECT_THROW(nvtext::minhash(view, seeds), std::invalid_argument); } diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index 2ece3015505..59759fb6c31 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -14,6 +14,6 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, - const hash_id hash_function, - size_type width + const size_type width, + const hash_id hash_function ) except + diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 68eda28e9c5..f0b2c799912 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -14,7 +14,7 @@ from cudf._lib.cpp.types cimport size_type @acquire_spill_lock() -def minhash(Column strings, Column seeds, str method, int width): +def minhash(Column strings, Column seeds, int width, str method): cdef column_view c_strings = strings.view() cdef size_type c_width = width @@ -31,8 +31,8 @@ def minhash(Column strings, Column seeds, str method, int width): cpp_minhash( c_strings, c_seeds, - c_hash_function, - c_width + c_width, + c_hash_function ) ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 54c9893e94a..845d3e5ae36 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5265,7 +5265,7 @@ def minhash( else: seeds = seeds._column return self._return_or_inplace( - libstrings.minhash(self._column, seeds, method, n) + libstrings.minhash(self._column, seeds, n, method) ) From 0423e82cb4f3fa858cce2a512b4a96b606365897 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Apr 2023 17:31:11 -0400 Subject: [PATCH 22/33] fix race condition on initializing hash output --- cpp/benchmarks/text/minhash.cpp | 2 +- cpp/src/text/minhash.cu | 30 +++++++++++++++++------------ python/cudf/cudf/tests/test_text.py | 11 +++++++++++ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 166ae6a05b2..15c39015d74 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -66,4 +66,4 @@ NVBENCH_BENCH(bench_minhash) .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) .add_int64_axis("row_width", {128, 512, 2048}) .add_int64_axis("hash_width", {5, 10, 25}) - .add_int64_axis("seed_count", {2, 260}); + .add_int64_axis("seed_count", {2, 26}); diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index d8a7aa168f5..444a15b2acd 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -54,11 +54,18 @@ struct minhash_fn { auto const lane_idx = idx % cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const output_idx = str_idx * seeds.size() + seed_idx; - d_hashes[output_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); + + // initialize hashes output for this string + if (lane_idx == 0) { + for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { + auto const out_idx = (str_idx * seeds.size()) + seed_idx; + d_hashes[out_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); + } } + __syncwarp(); + auto const begin = d_str.begin() + lane_idx; auto const end = [d_str, width = width] { auto const length = d_str.length(); @@ -66,20 +73,19 @@ struct minhash_fn { return d_str.begin() + static_cast(length > 0); }(); + // each lane hashes substrings of parts of the string for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); auto const ss = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); + // hashing each seed on the same section of string is 10x faster than + // re-substringing (my new word) for each seed for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const output_idx = str_idx * seeds.size() + seed_idx; - - auto const seed = seeds[seed_idx]; - auto const hasher = cudf::detail::MurmurHash3_32{seed}; - - auto const hvalue = hasher(ss); - // cudf::detail::hash_combine(seed, hasher(ss)); <-- matches cudf::hash() result - atomicMin(d_hashes + output_idx, hvalue); + auto const out_idx = (str_idx * seeds.size()) + seed_idx; + auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; + auto const hvalue = hasher(ss); + atomicMin(d_hashes + out_idx, hvalue); } } } @@ -123,7 +129,6 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, input.null_count()); return hashes; } - hashes->set_null_count(0); auto offsets = cudf::detail::sequence( input.size() + 1, @@ -131,6 +136,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::numeric_scalar(static_cast(seeds.size())), stream, mr); + hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls return make_lists_column(input.size(), std::move(offsets), std::move(hashes), diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index ab460da8ee8..cfeab0ceada 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -794,6 +794,17 @@ def test_minhash(): expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) actual = strings.str.minhash() assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), + cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + None, + cudf.Series([0, 0, 0], dtype=np.uint32), + ] + ) + actual = strings.str.minhash(seeds=seeds, n=5) + assert_eq(expected, actual) def test_read_text(datadir): From bd5d66094e5d6589e3b9f300c0326168ec630343 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 6 Apr 2023 13:16:06 -0400 Subject: [PATCH 23/33] add call to sanitize nulls --- cpp/src/text/minhash.cu | 20 +++++++++++++------- cpp/tests/text/minhash_tests.cpp | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 444a15b2acd..5b9678d17a0 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -137,13 +138,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, stream, mr); hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls - return make_lists_column(input.size(), - std::move(offsets), - std::move(hashes), - input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - stream, - mr); + auto result = make_lists_column(input.size(), + std::move(offsets), + std::move(hashes), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); + // expect this condition to be very rare + if (input.null_count() > 0) { + result = cudf::detail::purge_nonempty_nulls(result->view(), stream, mr); + } + return result; } } // namespace detail diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 5a2da838199..6963c6fe9ba 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -92,6 +93,23 @@ TEST_F(MinHashTest, MultiSeed) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(MinHashTest, MultiSeedWithNullInputRow) +{ + auto validity = cudf::test::iterators::null_at(1); + auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); + auto view = cudf::strings_column_view(input); + + auto const seeds = std::vector{1, 2}; + auto const d_seeds = cudf::detail::make_device_uvector_async( + seeds, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto results = nvtext::minhash(view, d_seeds); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); From a00c9596e49c210f9bf2f784236ebd28caa39b5c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Apr 2023 10:49:00 -0400 Subject: [PATCH 24/33] use thrust::fill to init the output --- cpp/include/nvtext/minhash.hpp | 6 +++--- cpp/src/text/minhash.cu | 31 ++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 315e0945164..b5cf0b0fb60 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -34,7 +34,7 @@ namespace nvtext { * Hash values are computed from substrings of each string and the * minimum hash value is returned for each string. * - * All null row entries are ignored and the output contains all valid rows. + * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if h_id is not HASH_MURMUR3 @@ -62,7 +62,7 @@ std::unique_ptr minhash( * Hash values are computed from substrings of each string and the * minimum hash value is returned for each string. * - * All null row entries are ignored and the output contains all valid rows. + * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if h_id is not HASH_MURMUR3 @@ -76,7 +76,7 @@ std::unique_ptr minhash( * @param h_id Hash algorithm to use; * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of Minhash values for each string per seed + * @return List column of minhash values for each string per seed */ std::unique_ptr minhash( cudf::strings_column_view const& input, diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 5b9678d17a0..e54a0bd0618 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -34,6 +34,8 @@ #include #include +#include +#include #include #include @@ -43,6 +45,12 @@ namespace nvtext { namespace detail { namespace { +/** + * @brief Compute the minhash of each string for each seed + * + * This is a warp-per-string algorithm where parallel threads within a warp + * work on substrings of a single string row. + */ struct minhash_fn { cudf::column_device_view d_strings; cudf::device_span seeds; @@ -56,14 +64,13 @@ struct minhash_fn { if (d_strings.is_null(str_idx)) { return; } - auto const d_str = d_strings.element(str_idx); + auto const d_str = d_strings.element(str_idx); + auto const d_output = d_hashes + (str_idx * seeds.size()); // initialize hashes output for this string if (lane_idx == 0) { - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const out_idx = (str_idx * seeds.size()) + seed_idx; - d_hashes[out_idx] = d_str.empty() ? 0 : std::numeric_limits::max(); - } + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); } __syncwarp(); @@ -77,16 +84,15 @@ struct minhash_fn { // each lane hashes substrings of parts of the string for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); - auto const ss = + auto const hash_str = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); // hashing each seed on the same section of string is 10x faster than // re-substringing (my new word) for each seed - for (auto seed_idx = 0; seed_idx < static_cast(seeds.size()); ++seed_idx) { - auto const out_idx = (str_idx * seeds.size()) + seed_idx; - auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; - auto const hvalue = hasher(ss); - atomicMin(d_hashes + out_idx, hvalue); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; + auto const hvalue = hasher(hash_str); + atomicMin(d_output + seed_idx, hvalue); } } } @@ -131,6 +137,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return hashes; } + // build the offsets for the output lists column auto offsets = cudf::detail::sequence( input.size() + 1, cudf::numeric_scalar(0), @@ -138,6 +145,8 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, stream, mr); hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls + + // build the lists column from the offsets and the hashes auto result = make_lists_column(input.size(), std::move(offsets), std::move(hashes), From e1920386e02e9bfb9faf2f02ba0a5f7083951705 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Apr 2023 13:22:18 -0400 Subject: [PATCH 25/33] fix doxygen for multi-seed API --- cpp/include/nvtext/minhash.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index b5cf0b0fb60..e84e54022cc 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -60,7 +60,10 @@ std::unique_ptr minhash( * @brief Returns the minhash values for each string per seed * * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. + * minimum hash value is returned for each string for each seed. + * Each row of the list column are seed results for the corresponding + * string. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. * * Any null row entries result in corresponding null output rows. * From 49db1e30f1253e1462529ad901891984582df87a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Apr 2023 15:26:05 -0400 Subject: [PATCH 26/33] fix some comments --- cpp/include/nvtext/minhash.hpp | 12 ++++++------ cpp/src/text/minhash.cu | 21 +++++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index e84e54022cc..6dc1b739d6e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -44,8 +44,8 @@ namespace nvtext { * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. - * @param h_id Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. + * @param hash_function Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ @@ -53,7 +53,7 @@ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), cudf::size_type width = 4, - cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, + cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,8 +76,8 @@ std::unique_ptr minhash( * @param width The character width used for apply substrings; * Any string smaller than this width will not be hashed. * Default is 4 characters. - * @param h_id Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. + * @param hash_function Hash algorithm to use; + * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ @@ -85,7 +85,7 @@ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, - cudf::hash_id h_id = cudf::hash_id::HASH_MURMUR3, + cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index e54a0bd0618..3418520216e 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -81,14 +81,14 @@ struct minhash_fn { return d_str.begin() + static_cast(length > 0); }(); - // each lane hashes substrings of parts of the string + // each lane hashes substrings of the given width for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { auto const offset = itr.byte_offset(); auto const hash_str = cudf::string_view(d_str.data() + offset, (itr + width).byte_offset() - offset); // hashing each seed on the same section of string is 10x faster than - // re-substringing (my new word) for each seed + // computing the substrings for each seed for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; auto const hvalue = hasher(hash_str); @@ -103,14 +103,15 @@ struct minhash_fn { std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width, - cudf::hash_id h_id, + cudf::hash_id hash_function, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS( - width > 1, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); - CUDF_EXPECTS(h_id == cudf::hash_id::HASH_MURMUR3, + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3, "Only murmur3 hash algorithm supported", std::invalid_argument); @@ -166,22 +167,22 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, - cudf::hash_id h_id, + cudf::hash_id hash_function, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto seeds = cudf::device_span{seed.data(), 1}; - return detail::minhash(input, seeds, width, h_id, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width, - cudf::hash_id h_id, + cudf::hash_id hash_function, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, h_id, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); } } // namespace nvtext From 04b92291323dc1efc5b35c847fa0aee4d790a745 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Apr 2023 09:27:00 -0400 Subject: [PATCH 27/33] use Optional[cudf.Series] declaration --- python/cudf/cudf/core/column/string.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 845d3e5ae36..657b1f01679 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5227,7 +5227,10 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds=None, n: int = 4, method: str = "murmur3" + self, + seeds: Optional[cudf.Series] = None, + n: int = 4, + method: str = "murmur3", ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5262,6 +5265,8 @@ def minhash( """ if seeds is None: seeds = column.as_column(0, dtype=np.uint32, length=1) + elif not isinstance(seeds, cudf.Series): + raise ValueError("Must provide a Series of seeds") else: seeds = seeds._column return self._return_or_inplace( From 1bad8e21598492f070e2829200962879f3029fbc Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Apr 2023 12:23:19 -0400 Subject: [PATCH 28/33] add overflow check for seeds*input-rows --- cpp/src/text/minhash.cu | 18 +++++++++++------ cpp/tests/text/minhash_tests.cpp | 27 +++++++++++++++----------- python/cudf/cudf/core/column/string.py | 12 +++++++----- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 3418520216e..73613d614f3 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -59,8 +59,8 @@ struct minhash_fn { __device__ void operator()(cudf::size_type idx) { - auto const str_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { return; } @@ -114,6 +114,11 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3, "Only murmur3 hash algorithm supported", std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows must not exceed maximum of size_type", + std::invalid_argument); auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } @@ -127,10 +132,11 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size() * cudf::detail::warp_size, - minhash_fn{*d_strings, seeds, width, d_hashes}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(std::size_t{0}), + static_cast(input.size()) * static_cast(cudf::detail::warp_size), + minhash_fn{*d_strings, seeds, width, d_hashes}); if (seeds.size() == 1) { hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 6963c6fe9ba..282f6611bca 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -76,11 +75,9 @@ TEST_F(MinHashTest, MultiSeed) auto view = cudf::strings_column_view(input); - auto const seeds = std::vector{0, 1, 2}; - auto const d_seeds = cudf::detail::make_device_uvector_async( - seeds, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, d_seeds); + auto results = nvtext::minhash(view, cudf::column_view(seeds)); using LCW = cudf::test::lists_column_wrapper; // clang-format off @@ -99,10 +96,8 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); auto view = cudf::strings_column_view(input); - auto const seeds = std::vector{1, 2}; - auto const d_seeds = cudf::detail::make_device_uvector_async( - seeds, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto results = nvtext::minhash(view, d_seeds); + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::minhash(view, cudf::column_view(seeds)); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, @@ -124,6 +119,16 @@ TEST_F(MinHashTest, ErrorsTest) auto view = cudf::strings_column_view(input); EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); - auto seeds = cudf::device_span{}; - EXPECT_THROW(nvtext::minhash(view, seeds), std::invalid_argument); + auto seeds = cudf::test::fixed_width_column_wrapper< + cudf::hash_value_type>(); // cudf::device_span{}; + EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); + + std::vector h_input(50000, ""); + input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); + view = cudf::strings_column_view(input); + + auto const zeroes = thrust::constant_iterator(0); + seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 657b1f01679..fefa7beb562 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5264,13 +5264,15 @@ def minhash( dtype: list """ if seeds is None: - seeds = column.as_column(0, dtype=np.uint32, length=1) - elif not isinstance(seeds, cudf.Series): - raise ValueError("Must provide a Series of seeds") + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint32: + seeds_column = seeds._column else: - seeds = seeds._column + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) return self._return_or_inplace( - libstrings.minhash(self._column, seeds, n, method) + libstrings.minhash(self._column, seeds_column, n, method) ) From a918b6505d9f4854c75b5ceed9a7edb8aa9c8880 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Apr 2023 12:30:48 -0400 Subject: [PATCH 29/33] support std::size_t for for-each-n in warp-per-string functor --- cpp/src/text/minhash.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 73613d614f3..eb3b9092185 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -57,7 +57,7 @@ struct minhash_fn { cudf::size_type width; cudf::hash_value_type* d_hashes; - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(std::size_t idx) { auto const str_idx = static_cast(idx / cudf::detail::warp_size); auto const lane_idx = static_cast(idx % cudf::detail::warp_size); From 472108ba3b0a857f7c87aed622d4ed6eb2a7ef39 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 14 Apr 2023 11:27:16 -0400 Subject: [PATCH 30/33] add tests for error cases --- python/cudf/cudf/tests/test_text.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index cfeab0ceada..46ee7b58c87 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -806,6 +806,14 @@ def test_minhash(): actual = strings.str.minhash(seeds=seeds, n=5) assert_eq(expected, actual) + with pytest.raises(ValueError): + strings.str.minhash(seeds=7) + with pytest.raises(ValueError): + strings.str.minhash(seeds=seeds, method="md5") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash(seeds=seeds) + def test_read_text(datadir): chess_file = str(datadir) + "/chess.pgn" From a82c074f58ff507bf8bb6e1d350b91137969ea0b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 17 Apr 2023 08:36:17 -0400 Subject: [PATCH 31/33] fix style violation --- cpp/tests/text/minhash_tests.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 282f6611bca..9572ccd1baf 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -30,8 +30,7 @@ #include -struct MinHashTest : public cudf::test::BaseFixture { -}; +struct MinHashTest : public cudf::test::BaseFixture {}; TEST_F(MinHashTest, Basic) { From 5975167aaf7e25076990a531f33c5fb89ba36cd9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 18 Apr 2023 14:15:12 -0400 Subject: [PATCH 32/33] fix doxygen comments --- cpp/include/nvtext/minhash.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 6dc1b739d6e..9fdaeda0959 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -37,12 +37,11 @@ namespace nvtext { * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if h_id is not HASH_MURMUR3 + * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * * @param input Strings column to compute minhash * @param seed Seed value used for the MurmurHash3_32 algorithm * @param width The character width used for apply substrings; - * Any string smaller than this width will not be hashed. * Default is 4 characters. * @param hash_function Hash algorithm to use; * Only HASH_MURMUR3 is currently supported. @@ -68,18 +67,18 @@ std::unique_ptr minhash( * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if h_id is not HASH_MURMUR3 + * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * @throw std::invalid_argument if seeds is empty * * @param input Strings column to compute minhash * @param seeds Seed values used for the MurmurHash3_32 algorithm * @param width The character width used for apply substrings; - * Any string smaller than this width will not be hashed. * Default is 4 characters. * @param hash_function Hash algorithm to use; * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed + * or a hash_value_type column if only a single seed is specified */ std::unique_ptr minhash( cudf::strings_column_view const& input, From 4f1a1b0d35d89bf75d97b16dafbf46c1bb70f52f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 20 Apr 2023 10:43:16 -0400 Subject: [PATCH 33/33] remove unused cimport --- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index 59759fb6c31..0509083ae3b 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -1,6 +1,5 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column