From 6004d3557ffe4cd6b61b82cb992593f41ff284ba Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Aug 2023 17:49:44 -0400 Subject: [PATCH 1/8] Add minhash support for MurmurHash3_x64_128 --- cpp/include/nvtext/minhash.hpp | 82 +++++++-- cpp/src/text/minhash.cu | 168 +++++++++++------- cpp/tests/text/minhash_tests.cpp | 51 ++++-- python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 8 +- python/cudf/cudf/_lib/nvtext/minhash.pyx | 36 ++-- python/cudf/cudf/_lib/strings/__init__.py | 2 +- python/cudf/cudf/core/column/string.py | 45 +++-- .../cudf/cudf/tests/text/test_text_methods.py | 29 +++ 8 files changed, 307 insertions(+), 114 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index dda23a2ba5b..9023c36a349 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -36,24 +36,23 @@ namespace nvtext { * * Any null row entries result in corresponding null output rows. * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * * @param input Strings column to compute minhash - * @param seed Seed value used for the MurmurHash3_x86_32 algorithm + * @param seed Seed value used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. - * @param hash_function Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ std::unique_ptr minhash( cudf::strings_column_view const& input, - cudf::numeric_scalar seed = cudf::numeric_scalar(cudf::DEFAULT_HASH_SEED), - cudf::size_type width = 4, - cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::numeric_scalar seed = 0, + cudf::size_type width = 4, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns the minhash values for each string per seed @@ -64,6 +63,8 @@ std::unique_ptr minhash( * string. The order of the elements in each row match the order of * the seeds provided in the `seeds` parameter. * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 @@ -72,20 +73,75 @@ std::unique_ptr minhash( * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash - * @param seeds Seed values used for the MurmurHash3_x86_32 algorithm + * @param seeds Seed values used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. - * @param hash_function Hash algorithm to use; - * Only HASH_MURMUR3 is currently supported. * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed - * or a hash_value_type column if only a single seed is specified + * or a UINT32 type column if only a single seed is specified */ std::unique_ptr minhash( cudf::strings_column_view const& input, - cudf::device_span seeds, + cudf::device_span seeds, + cudf::size_type width = 4, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the minhash value for each string + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string. + * + * Any null row entries result in corresponding null output rows. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param width The character width used for apply substrings; + * Default is 4 characters. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Minhash values as UINT64 for each string in input + */ +std::unique_ptr minhash64( + cudf::strings_column_view const& input, + cudf::numeric_scalar seed = 0, + cudf::size_type width = 4, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the minhash values for each string per seed + * + * Hash values are computed from substrings of each string and the + * minimum hash value is returned for each string for each seed. + * Each row of the list column are seed results for the corresponding + * string. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param width The character width used for apply substrings; + * Default is 4 characters. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + * or a UINT64 type column if only a single seed is specified + */ +std::unique_ptr minhash64( + cudf::strings_column_view const& input, + cudf::device_span seeds, cudf::size_type width = 4, - cudf::hash_id hash_function = cudf::hash_id::HASH_MURMUR3, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 05210b60154..342b3b85273 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,6 @@ #include #include -#include -#include #include @@ -51,63 +50,85 @@ namespace { * * This is a warp-per-string algorithm where parallel threads within a warp * work on substrings of a single string row. + * + * @tparam HashFunction hash function to use on each substring + * + * @param d_strings Strings column to process + * @param seeds Seeds for hashing each string + * @param width Substring window size in characters + * @param d_hashes Minhash output values for each string */ -struct minhash_fn { - cudf::column_device_view d_strings; - cudf::device_span seeds; - cudf::size_type width; - cudf::hash_value_type* d_hashes; - - __device__ void operator()(std::size_t idx) - { - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = - cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = - cudf::hashing::detail::MurmurHash3_x86_32{seeds[seed_idx]}; +template , + uint32_t, + uint64_t>> +__global__ void minhash_fn(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) +{ + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size() * + static_cast(cudf::detail::warp_size)))) { + return; + } + + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + + if (d_strings.is_null(str_idx)) { return; } + + auto const d_str = d_strings.element(str_idx); + auto const d_output = d_hashes + (str_idx * seeds.size()); + + // initialize hashes output for this string + if (lane_idx == 0) { + auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + + // each lane hashes 'width' substrings of d_str + for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { + if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string + + auto const hash_str = cudf::string_view(itr, bytes); + // hashing with each seed on the same section of the string is 10x faster than + // computing the substrings for each seed + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash substring and store the min value + if constexpr (std::is_same_v) { auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{ - *(d_output + seed_idx)}; + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); + } else { + auto const hvalue = thrust::get<0>(hasher(hash_str)); // just use the first uint64_t + cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } } } -}; +} } // namespace +template , + uint32_t, + uint64_t>> std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_id hash_function, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -115,15 +136,12 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, CUDF_EXPECTS(width >= 2, "Parameter width should be an integer value of 2 or greater", std::invalid_argument); - CUDF_EXPECTS(hash_function == cudf::hash_id::HASH_MURMUR3, - "Only murmur3 hash algorithm supported", - std::invalid_argument); CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < static_cast(std::numeric_limits::max()), "The number of seeds times the number of input rows exceeds the column size limit", std::overflow_error); - auto output_type = cudf::data_type{cudf::type_to_id()}; + auto output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); @@ -133,13 +151,12 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_hashes = hashes->mutable_view().data(); + auto d_hashes = hashes->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(std::size_t{0}), - static_cast(input.size()) * static_cast(cudf::detail::warp_size), - minhash_fn{*d_strings, seeds, width, d_hashes}); + constexpr int block_size = 256; + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + minhash_fn<<>>( + *d_strings, seeds, width, d_hashes); if (seeds.size() == 1) { hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), @@ -174,24 +191,45 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, + cudf::numeric_scalar seed, cudf::size_type width, - cudf::hash_id hash_function, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto seeds = cudf::device_span{seed.data(), 1}; - return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); + auto seeds = cudf::device_span{seed.data(), 1}; + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + cudf::device_span seeds, cudf::size_type width, - cudf::hash_id hash_function, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, hash_function, cudf::get_default_stream(), mr); + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + auto seeds = cudf::device_span{seed.data(), 1}; + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); } } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index fa4e2a91600..2a99986d944 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -48,9 +48,20 @@ TEST_F(MinHashTest, Basic) auto results = nvtext::minhash(view); - auto expected = cudf::test::fixed_width_column_wrapper( + auto expected = cudf::test::fixed_width_column_wrapper( {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, {1, 0, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto results64 = nvtext::minhash64(view); + auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, + 0ul, + 3232308021562742685ul, + 0ul, + 13145552576991307582ul, + 14660046701545912182ul, + 398062025280761388ul}, + {1, 0, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, LengthEqualsWidth) @@ -58,7 +69,7 @@ TEST_F(MinHashTest, LengthEqualsWidth) auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( + auto expected = cudf::test::fixed_width_column_wrapper( {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -74,11 +85,11 @@ TEST_F(MinHashTest, MultiSeed) auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); + auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); auto results = nvtext::minhash(view, cudf::column_view(seeds)); - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; // clang-format off LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, LCW{ 21141582u, 580916568u, 1258052021u}, @@ -87,6 +98,20 @@ TEST_F(MinHashTest, MultiSeed) LCW{ 86520422u, 236622901u, 102546228u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); + + auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, + LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, + LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, + LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, + LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, MultiSeedWithNullInputRow) @@ -95,10 +120,10 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); auto results = nvtext::minhash(view, cudf::column_view(seeds)); - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -110,6 +135,8 @@ TEST_F(MinHashTest, EmptyTest) auto view = cudf::strings_column_view(input->view()); auto results = nvtext::minhash(view); EXPECT_EQ(results->size(), 0); + results = nvtext::minhash64(view); + EXPECT_EQ(results->size(), 0); } TEST_F(MinHashTest, ErrorsTest) @@ -117,15 +144,19 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, 0, cudf::hash_id::HASH_MD5), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); + auto seeds = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); + auto seeds64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); - auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + auto const zeroes = thrust::constant_iterator(0); + seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); + seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); } diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd index 0509083ae3b..08b3330953e 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd @@ -4,7 +4,6 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.hash cimport hash_id from cudf._lib.cpp.types cimport size_type @@ -14,5 +13,10 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &strings, const column_view &seeds, const size_type width, - const hash_id hash_function + ) except + + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const column_view &seeds, + const size_type width, ) except + diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index f0b2c799912..6ed5ca834ee 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -8,31 +8,47 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.hash cimport hash_id as cpp_hash_id -from cudf._lib.cpp.nvtext.minhash cimport minhash as cpp_minhash +from cudf._lib.cpp.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, +) from cudf._lib.cpp.types cimport size_type @acquire_spill_lock() -def minhash(Column strings, Column seeds, int width, str method): +def minhash(Column strings, Column seeds, int width): cdef column_view c_strings = strings.view() cdef size_type c_width = width cdef column_view c_seeds = seeds.view() cdef unique_ptr[column] c_result - cdef cpp_hash_id c_hash_function - if method == "murmur3": - c_hash_function = cpp_hash_id.HASH_MURMUR3 - else: - raise ValueError(f"Unsupported hash function: {method}") with nogil: c_result = move( cpp_minhash( c_strings, c_seeds, - c_width, - c_hash_function + c_width + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def minhash64(Column strings, Column seeds, int width): + + cdef column_view c_strings = strings.view() + cdef size_type c_width = width + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_minhash64( + c_strings, + c_seeds, + c_width ) ) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index fa51d78b5c4..16875e4397e 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -6,7 +6,7 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash +from cudf._lib.nvtext.minhash import minhash, minhash64 from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0270351347d..91712e00903 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5292,7 +5292,7 @@ def minhash( self, seeds: Optional[cudf.Series] = None, n: int = 4, - method: str = "murmur3", + method: str = "MurmurHash3_x86_32", ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5307,8 +5307,8 @@ def minhash( Default is 4 characters. method : str Hash function to use. - Only 'murmur3' (MurmurHash3_32) is supported. - Default is 'murmur3'. + Default is 'MurmurHash3_x86_32'. + Only 'MurmurHash3_x86_32' and 'MurmurHash3_x64_128' are supported. Examples -------- @@ -5324,18 +5324,37 @@ def minhash( 0 [21141582, 403093213, 1258052021] 1 [962346254, 677440381, 122618762] dtype: list + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> str_series.str.minhash(seeds, method='MurmurHash3_x64_128') + 0 [3232308021562742685, 4445611509348165860, 586435843695903598] + 1 [23008204270530356, 1281229757012344693, 153762819128779913] + dtype: list """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint32: - seeds_column = seeds._column - else: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" + if method == "MurmurHash3_x86_32": + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint32: + seeds_column = seeds._column + else: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.minhash(self._column, seeds_column, n) ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, n, method) - ) + if method == "MurmurHash3_x64_128": + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint64: + seeds_column = seeds._column + else: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.minhash64(self._column, seeds_column, n) + ) + raise ValueError(f"Unsupported hash function: {method}") def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e1dda1ae5d1..27778604548 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -836,6 +836,7 @@ def test_is_vowel_consonant(): def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) actual = strings.str.minhash() assert_eq(expected, actual) @@ -851,6 +852,31 @@ def test_minhash(): actual = strings.str.minhash(seeds=seeds, n=5) assert_eq(expected, actual) + expected = cudf.Series( + [3232308021562742685, 23008204270530356, None, 0], dtype=np.uint64 + ) + actual = strings.str.minhash(method="MurmurHash3_x64_128") + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [7082801294247314046, 185949556058924788, 167570629329462454], + dtype=np.uint64, + ), + cudf.Series( + [382665377781028452, 86243762733551437, 7688750597953083512], + dtype=np.uint64, + ), + None, + cudf.Series([0, 0, 0], dtype=np.uint64), + ] + ) + actual = strings.str.minhash( + seeds=seeds, n=5, method="MurmurHash3_x64_128" + ) + assert_eq(expected, actual) + with pytest.raises(ValueError): strings.str.minhash(seeds=7) with pytest.raises(ValueError): @@ -858,6 +884,9 @@ def test_minhash(): with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.int32) strings.str.minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash(seeds=seeds, method="MurmurHash3_x64_128") def test_jaccard_index(): From 35ce135c107a550b39420f0f1642f9a889d31bff Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 10 Aug 2023 12:22:19 -0400 Subject: [PATCH 2/8] add minhash64 to benchmarks --- cpp/benchmarks/text/minhash.cpp | 17 ++++++++++------- cpp/tests/text/minhash_tests.cpp | 24 +++++++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index bcc254575c0..232df638e82 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -30,6 +30,7 @@ static void bench_minhash(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const b64 = state.get_int64("htype") == 64; if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -44,9 +45,9 @@ static void bench_minhash(nvbench::state& state) data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seeds_table = create_random_table( - {cudf::type_to_id()}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); + auto const seed_type = b64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -56,13 +57,15 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), hash_width); + auto result = b64 ? nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10, 25}) - .add_int64_axis("seed_count", {2, 26}); + .add_int64_axis("hash_width", {5, 10}) + .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("htype", {32, 64}); diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 2a99986d944..b1c961ec9e1 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -34,6 +34,7 @@ struct MinHashTest : public cudf::test::BaseFixture {}; TEST_F(MinHashTest, Basic) { + auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", "", @@ -42,14 +43,14 @@ TEST_F(MinHashTest, Basic) "doc 3", "d", "The quick brown fox jumpéd over the lazy brown dog."}, - {1, 0, 1, 1, 1, 1, 1}); + validity); auto view = cudf::strings_column_view(input); auto results = nvtext::minhash(view); auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, {1, 0, 1, 1, 1, 1, 1}); + {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); auto results64 = nvtext::minhash64(view); @@ -60,7 +61,7 @@ TEST_F(MinHashTest, Basic) 13145552576991307582ul, 14660046701545912182ul, 398062025280761388ul}, - {1, 0, 1, 1, 1, 1, 1}); + validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } @@ -85,8 +86,7 @@ TEST_F(MinHashTest, MultiSeed) auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - + auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); auto results = nvtext::minhash(view, cudf::column_view(seeds)); using LCW = cudf::test::lists_column_wrapper; @@ -99,8 +99,7 @@ TEST_F(MinHashTest, MultiSeed) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - + auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); using LCW64 = cudf::test::lists_column_wrapper; @@ -127,6 +126,17 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, + LCW64{}, + LCW64{0ul, 0ul}, + LCW64{2717781266371273264ul, 6977325820868387259ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) From 72b66c29749318dad2e7a36ccc2b202efb4172a9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 10 Aug 2023 20:27:53 -0400 Subject: [PATCH 3/8] add const decl --- cpp/benchmarks/text/minhash.cpp | 10 +++++----- cpp/include/nvtext/minhash.hpp | 3 ++- cpp/src/text/minhash.cu | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 232df638e82..1b60caa24de 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -30,7 +30,7 @@ static void bench_minhash(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const b64 = state.get_int64("htype") == 64; + auto const base64 = state.get_int64("hash_type") == 64; if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -45,7 +45,7 @@ static void bench_minhash(nvbench::state& state) data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = b64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); auto seeds = seeds_table->get_column(0); seeds.set_null_mask(rmm::device_buffer{}, 0); @@ -57,8 +57,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = b64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } @@ -68,4 +68,4 @@ NVBENCH_BENCH(bench_minhash) .add_int64_axis("row_width", {128, 512, 2048}) .add_int64_axis("hash_width", {5, 10}) .add_int64_axis("seed_count", {2, 26}) - .add_int64_axis("htype", {32, 64}); + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 9023c36a349..8d05f5a0165 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -95,9 +95,10 @@ std::unique_ptr minhash( * Any null row entries result in corresponding null output rows. * * This function uses MurmurHash3_x64_128 for the hash algorithm. + * The hash function returns 2 uint64 values but only the first value + * is used with the minhash calculation. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * * @param input Strings column to compute minhash * @param seed Seed value used for the hash algorithm diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 342b3b85273..f6ce5562c5c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -92,7 +92,7 @@ __global__ void minhash_fn(cudf::column_device_view const d_strings, auto const begin = d_str.data() + lane_idx; auto const end = d_str.data() + d_str.size_bytes(); - // each lane hashes 'width' substrings of d_str + // each lane hashes 'width' substrings of d_str for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } auto const check_str = // used for counting 'width' characters @@ -141,7 +141,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, "The number of seeds times the number of input rows exceeds the column size limit", std::overflow_error); - auto output_type = cudf::data_type{cudf::type_to_id()}; + auto const output_type = cudf::data_type{cudf::type_to_id()}; if (input.is_empty()) { return cudf::make_empty_column(output_type); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); From 0ecc9c6b787094fc274fd4d0e605bb6a795c0f14 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Aug 2023 11:52:49 -0400 Subject: [PATCH 4/8] add comment about using only the first value from murmurhash3_x64_128 --- cpp/src/text/minhash.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index f6ce5562c5c..d979e2dfcfa 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -111,7 +111,9 @@ __global__ void minhash_fn(cudf::column_device_view const d_strings, cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } else { - auto const hvalue = thrust::get<0>(hasher(hash_str)); // just use the first uint64_t + // This code path assumes the murmurhash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as per requested by the LLM team. + auto const hvalue = thrust::get<0>(hasher(hash_str)); cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } From 180c619197e2d8dee20b9915a1070ed65fda741f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 11 Aug 2023 11:52:51 -0500 Subject: [PATCH 5/8] Remove errors about hash_function. --- cpp/include/nvtext/minhash.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 8d05f5a0165..d070b299aba 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -39,7 +39,6 @@ namespace nvtext { * This function uses MurmurHash3_x86_32 for the hash algorithm. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * * @param input Strings column to compute minhash * @param seed Seed value used for the hash algorithm @@ -68,7 +67,6 @@ std::unique_ptr minhash( * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit * @@ -127,7 +125,6 @@ std::unique_ptr minhash64( * Any null row entries result in corresponding null output rows. * * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if hash_function is not HASH_MURMUR3 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit * From 706b1e4c451a3e1a94bc4da3631c1406c8bad80d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Aug 2023 17:23:37 -0400 Subject: [PATCH 6/8] rework multi-seed to always return list --- cpp/include/nvtext/minhash.hpp | 10 +- cpp/src/text/minhash.cu | 132 ++++++++++++------ python/cudf/cudf/core/column/string.py | 83 ++++++----- .../cudf/cudf/tests/text/test_text_methods.py | 31 ++-- 4 files changed, 164 insertions(+), 92 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 8d05f5a0165..8109f7c7842 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -45,6 +45,7 @@ namespace nvtext { * @param seed Seed value used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ @@ -52,6 +53,7 @@ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,14 +78,15 @@ std::unique_ptr minhash( * @param seeds Seed values used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed - * or a UINT32 type column if only a single seed is specified */ std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -104,6 +107,7 @@ std::unique_ptr minhash( * @param seed Seed value used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ @@ -111,6 +115,7 @@ std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -135,14 +140,15 @@ std::unique_ptr minhash64( * @param seeds Seed values used for the hash algorithm * @param width The character width used for apply substrings; * Default is 4 characters. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed - * or a UINT64 type column if only a single seed is specified */ std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index d979e2dfcfa..dc1f49a04a2 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -58,15 +58,14 @@ namespace { * @param width Substring window size in characters * @param d_hashes Minhash output values for each string */ -template , - uint32_t, - uint64_t>> -__global__ void minhash_fn(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +__global__ void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) { auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx >= (static_cast(d_strings.size() * @@ -111,8 +110,8 @@ __global__ void minhash_fn(cudf::column_device_view const d_strings, cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } else { - // This code path assumes the murmurhash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as per requested by the LLM team. + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. auto const hvalue = thrust::get<0>(hasher(hash_str)); cuda::atomic_ref ref{*(d_output + seed_idx)}; ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); @@ -121,18 +120,15 @@ __global__ void minhash_fn(cudf::column_device_view const d_strings, } } -} // namespace - -template , - uint32_t, - uint64_t>> -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); CUDF_EXPECTS(width >= 2, @@ -157,22 +153,22 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, constexpr int block_size = 256; cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; - minhash_fn<<>>( + minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); - if (seeds.size() == 1) { - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count()); - return hashes; - } + return hashes; +} +std::unique_ptr build_list_result(cudf::strings_column_view const& input, + std::unique_ptr&& hashes, + cudf::size_type seeds_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // build the offsets for the output lists column - auto offsets = cudf::detail::sequence( - input.size() + 1, - cudf::numeric_scalar(0), - cudf::numeric_scalar(static_cast(seeds.size())), - stream, - mr); + auto const zero = cudf::numeric_scalar(0); + auto const size = cudf::numeric_scalar(static_cast(seeds_size)); + auto offsets = cudf::detail::sequence(input.size() + 1, zero, size, stream, mr); hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls // build the lists column from the offsets and the hashes @@ -189,49 +185,95 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, } return result; } +} // namespace +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto const seeds = cudf::device_span{seed.data(), 1}; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + return hashes; +} + +std::unique_ptr minhash(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::numeric_scalar seed, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto const seeds = cudf::device_span{seed.data(), 1}; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); + return hashes; +} + +std::unique_ptr minhash64(cudf::strings_column_view const& input, + cudf::device_span seeds, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); + return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); +} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto seeds = cudf::device_span{seed.data(), 1}; - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seed, width, stream, mr); } std::unique_ptr minhash(cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash(input, seeds, width, stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar seed, cudf::size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto seeds = cudf::device_span{seed.data(), 1}; - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash64(input, seed, width, stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - return detail::minhash(input, seeds, width, cudf::get_default_stream(), mr); + return detail::minhash64(input, seeds, width, stream, mr); } } // namespace nvtext diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e66ce53a586..4e7120204c2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5287,26 +5287,20 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, - seeds: Optional[cudf.Series] = None, - n: int = 4, - method: str = "MurmurHash3_x86_32", + self, seeds: Optional[ColumnLike] = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x86_32 algorithm for the hash function. Parameters ---------- - seeds : Series + seeds : ColumnLike The seeds used for the hash algorithm. Must be of type uint32. - n : int + width : int The width of the substring to hash. Default is 4 characters. - method : str - Hash function to use. - Default is 'MurmurHash3_x86_32'. - Only 'MurmurHash3_x86_32' and 'MurmurHash3_x64_128' are supported. Examples -------- @@ -5314,45 +5308,66 @@ def minhash( >>> str_series = cudf.Series(['this is my', 'favorite book']) >>> seeds = cudf.Series([0], dtype=np.uint32) >>> str_series.str.minhash(seeds) - 0 21141582 - 1 962346254 + 0 [21141582] + 1 [962346254] dtype: uint32 >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) >>> str_series.str.minhash(seeds) 0 [21141582, 403093213, 1258052021] 1 [962346254, 677440381, 122618762] dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.minhash(self._column, seeds_column, width) + ) + + def minhash64( + self, seeds: Optional[ColumnLike] = None, width: int = 4 + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + width : int + The width of the substring to hash. + Default is 4 characters. + + Examples + -------- + >>> import cudf + >>> str_series = cudf.Series(['this is my', 'favorite book']) >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) >>> str_series.str.minhash(seeds, method='MurmurHash3_x64_128') 0 [3232308021562742685, 4445611509348165860, 586435843695903598] 1 [23008204270530356, 1281229757012344693, 153762819128779913] dtype: list """ - if method == "MurmurHash3_x86_32": - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint32: - seeds_column = seeds._column - else: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, n) - ) - if method == "MurmurHash3_x64_128": - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - elif isinstance(seeds, cudf.Series) and seeds.dtype == np.uint64: - seeds_column = seeds._column - else: + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: raise ValueError( f"Expecting a Series with dtype uint64, got {type(seeds)}" ) - return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, n) - ) - raise ValueError(f"Unsupported hash function: {method}") + return self._return_or_inplace( + libstrings.minhash64(self._column, seeds_column, width) + ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 27778604548..8cda15e4acc 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -837,7 +837,14 @@ def test_is_vowel_consonant(): def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) - expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + None, + cudf.Series([0], dtype=np.uint32), + ] + ) actual = strings.str.minhash() assert_eq(expected, actual) seeds = cudf.Series([0, 1, 2], dtype=np.uint32) @@ -849,13 +856,18 @@ def test_minhash(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, n=5) + actual = strings.str.minhash(seeds=seeds, width=5) assert_eq(expected, actual) expected = cudf.Series( - [3232308021562742685, 23008204270530356, None, 0], dtype=np.uint64 + [ + cudf.Series([3232308021562742685], dtype=np.uint64), + cudf.Series([23008204270530356], dtype=np.uint64), + None, + cudf.Series([0], dtype=np.uint64), + ] ) - actual = strings.str.minhash(method="MurmurHash3_x64_128") + actual = strings.str.minhash64() assert_eq(expected, actual) seeds = cudf.Series([0, 1, 2], dtype=np.uint64) expected = cudf.Series( @@ -872,21 +884,18 @@ def test_minhash(): cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash( - seeds=seeds, n=5, method="MurmurHash3_x64_128" - ) + actual = strings.str.minhash64(seeds=seeds, width=5) assert_eq(expected, actual) + # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds=7) - with pytest.raises(ValueError): - strings.str.minhash(seeds=seeds, method="md5") + strings.str.minhash(seeds="a") with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.int32) strings.str.minhash(seeds=seeds) with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash(seeds=seeds, method="MurmurHash3_x64_128") + strings.str.minhash64(seeds=seeds) def test_jaccard_index(): From df38760f2b7a46e60d61ece891738afe472199f5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Aug 2023 20:23:00 -0400 Subject: [PATCH 7/8] fix incorrect comments --- python/cudf/cudf/core/column/string.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4e7120204c2..fe21dc87bac 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5310,7 +5310,7 @@ def minhash( >>> str_series.str.minhash(seeds) 0 [21141582] 1 [962346254] - dtype: uint32 + dtype: list >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) >>> str_series.str.minhash(seeds) 0 [21141582, 403093213, 1258052021] @@ -5352,7 +5352,7 @@ def minhash64( >>> import cudf >>> str_series = cudf.Series(['this is my', 'favorite book']) >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> str_series.str.minhash(seeds, method='MurmurHash3_x64_128') + >>> str_series.str.minhash64(seeds) 0 [3232308021562742685, 4445611509348165860, 586435843695903598] 1 [23008204270530356, 1281229757012344693, 153762819128779913] dtype: list From b347eca071a5a964c14e8c75508b826938c682c6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 18 Aug 2023 19:31:17 -0400 Subject: [PATCH 8/8] fix some castings --- cpp/src/text/minhash.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index dc1f49a04a2..f06eaa5b52c 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -68,8 +68,8 @@ __global__ void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_hashes) { auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size() * - static_cast(cudf::detail::warp_size)))) { + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { return; } @@ -167,7 +167,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& { // build the offsets for the output lists column auto const zero = cudf::numeric_scalar(0); - auto const size = cudf::numeric_scalar(static_cast(seeds_size)); + auto const size = cudf::numeric_scalar(seeds_size); auto offsets = cudf::detail::sequence(input.size() + 1, zero, size, stream, mr); hashes->set_null_mask(rmm::device_buffer{}, 0); // children have no nulls