Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word-based nvtext::minhash function #15368

Merged
merged 82 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
4a8ad7f
Word-based nvtext::minhash function
davidwendt Mar 21, 2024
cebcdf0
Merge branch 'branch-24.06' into word-minhash
davidwendt Mar 21, 2024
462130c
Merge branch 'branch-24.06' into word-minhash
davidwendt Mar 26, 2024
5e5b118
Merge branch 'branch-24.06' into word-minhash
davidwendt Mar 26, 2024
7cdff59
Merge branch 'branch-24.06' into word-minhash
davidwendt Apr 11, 2024
d65cd23
Merge branch 'branch-24.06' into word-minhash
davidwendt May 2, 2024
668464a
Merge branch 'branch-24.06' into word-minhash
davidwendt May 2, 2024
2dc94c1
Merge branch 'branch-24.06' into word-minhash
davidwendt May 6, 2024
76a7f1a
Merge branch 'branch-24.06' into word-minhash
davidwendt May 8, 2024
093d721
Merge branch 'branch-24.06' into word-minhash
davidwendt May 16, 2024
5a8128e
Merge branch 'branch-24.06' into word-minhash
davidwendt May 21, 2024
0c95eb7
Merge branch 'branch-24.08' into word-minhash
davidwendt May 21, 2024
9271569
Merge branch 'branch-24.08' into word-minhash
davidwendt May 21, 2024
2ce558d
Merge branch 'branch-24.08' into word-minhash
davidwendt May 23, 2024
cc174df
Merge branch 'branch-24.08' into word-minhash
davidwendt May 24, 2024
79d3a33
Merge branch 'branch-24.08' into word-minhash
davidwendt May 24, 2024
9eb323e
Merge branch 'branch-24.08' into word-minhash
davidwendt May 28, 2024
950e6c2
Merge branch 'branch-24.08' into word-minhash
davidwendt May 31, 2024
b7c467d
Merge branch 'branch-24.08' into word-minhash
davidwendt Jun 24, 2024
f98750f
Merge branch 'branch-24.08' into word-minhash
davidwendt Jun 25, 2024
9cbaa2b
Merge branch 'branch-24.08' into word-minhash
davidwendt Jun 27, 2024
b30a6fc
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 1, 2024
b4b4cb6
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 2, 2024
689d36e
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 3, 2024
3b14436
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 9, 2024
c674e62
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 10, 2024
d94d209
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 11, 2024
d6ab482
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 18, 2024
76c56b6
change name to word_minhash
davidwendt Jul 18, 2024
b0c682d
add python/cython interface
davidwendt Jul 18, 2024
91c8134
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 18, 2024
e00db60
undo extra string.py formatting
davidwendt Jul 18, 2024
dee9d83
add pytest for word_minhash
davidwendt Jul 18, 2024
596ca0b
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 18, 2024
a56ef87
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 18, 2024
4a6960f
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 19, 2024
a97c347
add benchmark
davidwendt Jul 19, 2024
140623e
Merge branch 'word-minhash' of github.com:davidwendt/cudf into word-m…
davidwendt Jul 22, 2024
b6f787d
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 22, 2024
a776913
Merge branch 'branch-24.08' into word-minhash
davidwendt Jul 23, 2024
04a9380
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 23, 2024
9a361e1
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 23, 2024
89f17eb
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 23, 2024
59095a6
Merge branch 'word-minhash' of github.com:davidwendt/cudf into word-m…
davidwendt Jul 24, 2024
621d291
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 24, 2024
2710f97
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 25, 2024
42a4849
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 25, 2024
6b32911
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 26, 2024
e5e3512
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 30, 2024
cc5f272
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 30, 2024
5da0936
add example to word-minhash python doc
davidwendt Jul 31, 2024
09a0109
fix thread index variable types
davidwendt Jul 31, 2024
645dc76
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 31, 2024
b42824f
use no_validity() for seeds profile in benchmark
davidwendt Jul 31, 2024
529843e
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 31, 2024
46d6e9e
Merge branch 'word-minhash' of github.com:davidwendt/cudf into word-m…
davidwendt Jul 31, 2024
01ff418
Merge branch 'branch-24.10' into word-minhash
davidwendt Jul 31, 2024
8f11ddd
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 1, 2024
4eb2700
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 1, 2024
744c612
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 2, 2024
b6c1812
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 5, 2024
0a6a595
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 6, 2024
f17eaa9
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 9, 2024
9569242
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 14, 2024
8209037
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 16, 2024
55ecde5
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 19, 2024
a99b23c
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 19, 2024
23d243b
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 19, 2024
bcc2923
Merge branch 'word-minhash' of github.com:davidwendt/cudf into word-m…
davidwendt Aug 19, 2024
deed806
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 19, 2024
e0dd76f
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 20, 2024
eddc771
Merge branch 'word-minhash' of github.com:davidwendt/cudf into word-m…
davidwendt Aug 20, 2024
f9cfa04
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 23, 2024
7a13299
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 27, 2024
d33e3d9
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 28, 2024
12796f6
Merge branch 'branch-24.10' into word-minhash
davidwendt Aug 29, 2024
1acf329
Merge branch 'branch-24.10' into word-minhash
davidwendt Sep 3, 2024
b45ada6
refactor atomic call
davidwendt Sep 3, 2024
4de2183
Merge branch 'branch-24.10' into word-minhash
davidwendt Sep 10, 2024
a34752a
fix doxygen
davidwendt Sep 10, 2024
0bb3e8e
Merge branch 'branch-24.10' into word-minhash
davidwendt Sep 11, 2024
cedc597
fix memory-resource parameter
davidwendt Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
Expand Down
77 changes: 77 additions & 0 deletions cpp/benchmarks/text/word_minhash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/minhash.hpp>

#include <rmm/device_buffer.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks nvtext::word_minhash and nvtext::word_minhash64
 *
 * A random strings column of `num_rows` strings is grouped into list rows of
 * `row_width` strings each and hashed with `seed_count` random seeds.
 * The `hash_type` axis (32 or 64) selects the 32-bit or the 64-bit API.
 *
 * @param state nvbench state providing the axis values and receiving the measurements
 */
static void bench_word_minhash(nvbench::state& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
  auto const base64     = state.get_int64("hash_type") == 64;

  data_profile const strings_profile =
    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
  auto strings_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);

  // offsets 0, row_width, 2*row_width, ... group the strings into fixed-width list rows;
  // strings past the last full row are not referenced by any list row
  auto const num_offsets = (num_rows / row_width) + 1;
  auto offsets           = cudf::sequence(num_offsets,
                                cudf::numeric_scalar<cudf::size_type>(0),
                                cudf::numeric_scalar<cudf::size_type>(row_width));

  auto source = cudf::make_lists_column(num_offsets - 1,
                                        std::move(offsets),
                                        std::move(strings_table->release().front()),
                                        0,
                                        rmm::device_buffer{});

  // key the distribution by the type that is actually generated so that it is
  // applied to the UINT64 seeds as well as to the UINT32 seeds
  auto const seed_type             = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
    seed_type, distribution_id::NORMAL, 0, 256);
  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
  // bind by reference to avoid making a copy of the seeds column
  auto const& seeds = seeds_table->get_column(0);

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
  auto chars_size = input.chars_size(cudf::get_default_stream());
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);

  // the output holds one hash per seed for each list row (not one per input string)
  // and the hash width depends on the API being measured
  auto const num_hashes = static_cast<nvbench::int64_t>(num_offsets - 1) * seed_count;
  if (base64) {
    state.add_global_memory_writes<nvbench::int64_t>(num_hashes);
  } else {
    state.add_global_memory_writes<nvbench::int32_t>(num_hashes);
  }

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
                         : nvtext::word_minhash(source->view(), seeds.view());
  });
}

// Registers bench_word_minhash with nvbench under the name "word_minhash".
// Axes: num_rows   - number of strings generated for the input
//       row_width  - number of strings placed in each list row
//       seed_count - number of seeds hashed per row
//       hash_type  - 32 selects nvtext::word_minhash, 64 selects nvtext::word_minhash64
NVBENCH_BENCH(bench_word_minhash)
.set_name("word_minhash")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
.add_int64_axis("row_width", {10, 100, 1000})
.add_int64_axis("seed_count", {2, 25})
.add_int64_axis("hash_type", {32, 64});
61 changes: 59 additions & 2 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
Expand Down Expand Up @@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand Down Expand Up @@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand All @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* A hash value is computed for each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each row matches
* the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* Any null row entries result in corresponding null output rows.
* An empty (non-null) row results in a 0 for each seed.
* A null string within a row is hashed as an empty string.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each row per seed
*/
std::unique_ptr<cudf::column> word_minhash(
cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the 64-bit minhash values for each row of strings per seed
*
* A hash value is computed for each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each row matches
* the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm though
* only the first 64-bits of the hash are used in computing the output.
*
* Any null row entries result in corresponding null output rows.
* An empty (non-null) row results in a 0 for each seed.
* A null string within a row is hashed as an empty string.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each row per seed
*/
std::unique_ptr<cudf::column> word_minhash64(
cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
147 changes: 141 additions & 6 deletions cpp/src/text/minhash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/murmurhash3_x64_128.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/lists/list_device_view.cuh>
#include <cudf/lists/lists_column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -151,15 +153,111 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
mr);
auto d_hashes = hashes->mutable_view().data<hash_value_type>();

constexpr int block_size = 256;
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
constexpr cudf::thread_index_type block_size = 256;
cudf::detail::grid_1d grid{
static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, seeds, width, d_hashes);

return hashes;
}

std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const& input,
/**
 * @brief Kernel producing the per-seed minimum hash over the strings of each list row
 *
 * This is a warp-per-row algorithm: the lanes assigned to a row stride over the
 * strings of that row and fold each hash into the row's output with an atomic min.
 *
 * Null rows are skipped and their output values are left unwritten.
 * An empty row produces 0 for every seed.
 * A null string inside a row is hashed as an empty string.
 *
 * @tparam HashFunction hash function to use on each string
 * @tparam hash_value_type uint32_t if the hash function's result type is uint32_t,
 *         otherwise uint64_t
 *
 * @param d_input List of strings to process
 * @param seeds Seeds for hashing each string
 * @param d_hashes Minhash output values (`seeds.size()` values per row)
 */
template <
  typename HashFunction,
  typename hash_value_type = std::
    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input,
                                     cudf::device_span<hash_value_type const> seeds,
                                     hash_value_type* d_hashes)
{
  auto const tid     = cudf::detail::grid_1d::global_thread_id();
  auto const row_idx = tid / cudf::detail::warp_size;

  // nothing to do for threads past the last row or for null rows
  if (row_idx >= d_input.size() || d_input.is_null(row_idx)) { return; }

  auto const lane    = static_cast<cudf::size_type>(tid % cudf::detail::warp_size);
  auto const d_list  = cudf::list_device_view(d_input, row_idx);
  auto const row_out = d_hashes + (row_idx * seeds.size());

  // the first lane primes this row's output: 0 for an empty row, otherwise the
  // largest value so that any computed hash replaces it
  if (lane == 0) {
    auto const init = d_list.size() == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
    thrust::fill(thrust::seq, row_out, row_out + seeds.size(), init);
  }
  // the primed values must be in place before any lane starts merging hashes
  __syncwarp();

  // lanes take strings from the row in warp-size strides
  for (auto pos = lane; pos < d_list.size(); pos += cudf::detail::warp_size) {
    // a null string is hashed as an empty string
    auto d_str = cudf::string_view{};
    if (!d_list.is_null(pos)) { d_str = d_list.element<cudf::string_view>(pos); }

    for (std::size_t s = 0; s < seeds.size(); ++s) {
      auto const hasher = HashFunction(seeds[s]);
      hash_value_type hv;
      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
        hv = hasher(d_str);
      } else {
        // the 64-bit path expects a hasher returning a pair of uint64 values
        // (MurmurHash3_x64_128); only the first value is used
        hv = thrust::get<0>(hasher(d_str));
      }
      // keep the smallest hash seen for this seed across all lanes of the row
      cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> slot{row_out[s]};
      slot.fetch_min(hv, cuda::std::memory_order_relaxed);
    }
  }
}

/**
 * @brief Validates the arguments, allocates the output and launches minhash_word_kernel
 *
 * The returned column is a flat column of `input.size() * seeds.size()` hash values
 * with no null mask; an empty column is returned for an empty input.
 *
 * @throw std::invalid_argument if seeds is empty
 * @throw std::overflow_error if `seeds.size() * input.size()` reaches the column size limit
 *
 * @tparam HashFunction hash function to use on each string
 * @tparam hash_value_type uint32_t if the hash function's result type is uint32_t,
 *         otherwise uint64_t
 *
 * @param input Lists column of strings to hash
 * @param seeds Seed values used for the hash algorithm
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Flat column of hash values
 */
template <
  typename HashFunction,
  typename hash_value_type = std::
    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
std::unique_ptr<cudf::column> word_minhash_fn(cudf::lists_column_view const& input,
                                              cudf::device_span<hash_value_type const> seeds,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr)
{
  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);

  // do the size arithmetic in std::size_t so the product itself cannot overflow size_type
  auto const total_hashes = static_cast<std::size_t>(input.size()) * seeds.size();
  auto const size_limit = static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max());
  CUDF_EXPECTS(total_hashes < size_limit,
               "The number of seeds times the number of input rows exceeds the column size limit",
               std::overflow_error);

  auto const hash_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
  if (input.is_empty()) { return cudf::make_empty_column(hash_type); }

  auto const d_parent = cudf::column_device_view::create(input.parent(), stream);
  auto const d_lists  = cudf::detail::lists_column_device_view(*d_parent);

  auto results = cudf::make_numeric_column(hash_type,
                                           input.size() * static_cast<cudf::size_type>(seeds.size()),
                                           cudf::mask_state::UNALLOCATED,
                                           stream,
                                           mr);

  // one warp's worth of threads is launched for each input row
  constexpr cudf::thread_index_type threads_per_block = 256;
  auto const total_threads =
    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size;
  cudf::detail::grid_1d const launch_cfg{total_threads, threads_per_block};
  minhash_word_kernel<HashFunction>
    <<<launch_cfg.num_blocks, launch_cfg.num_threads_per_block, 0, stream.value()>>>(
      d_lists, seeds, results->mutable_view().data<hash_value_type>());

  return results;
}

std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
std::unique_ptr<cudf::column>&& hashes,
cudf::size_type seeds_size,
rmm::cuda_stream_view stream,
Expand All @@ -176,7 +274,7 @@ std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const&
std::move(offsets),
std::move(hashes),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
cudf::detail::copy_bitmask(input, stream, mr),
stream,
mr);
// expect this condition to be very rare
Expand Down Expand Up @@ -208,7 +306,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
auto hashes = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}

std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
Expand All @@ -232,7 +330,27 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
auto hashes = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}

/**
 * @brief Computes the 32-bit word-based minhash and wraps the hashes in a lists column
 *
 * Uses MurmurHash3_x86_32 on each string and builds the output lists column
 * with `seeds.size()` hash values per row.
 */
std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                           cudf::device_span<uint32_t const> seeds,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
{
  using hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
  // the flat hashes column is handed directly to the list builder
  return build_list_result(input.parent(),
                           detail::word_minhash_fn<hasher_type>(input, seeds, stream, mr),
                           seeds.size(),
                           stream,
                           mr);
}

/**
 * @brief Computes the 64-bit word-based minhash and wraps the hashes in a lists column
 *
 * Uses MurmurHash3_x64_128 on each string and builds the output lists column
 * with `seeds.size()` hash values per row.
 */
std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
                                             cudf::device_span<uint64_t const> seeds,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
{
  using hasher_type = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
  // the flat hashes column is handed directly to the list builder
  return build_list_result(input.parent(),
                           detail::word_minhash_fn<hasher_type>(input, seeds, stream, mr),
                           seeds.size(),
                           stream,
                           mr);
}
} // namespace detail

Expand Down Expand Up @@ -276,4 +394,21 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
return detail::minhash64(input, seeds, width, stream, mr);
}

// Public entry point for the 32-bit word-based minhash.
// Forwards every argument unchanged to detail::word_minhash, which does the work.
std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::word_minhash(input, seeds, stream, mr);
}

// Public entry point for the 64-bit word-based minhash.
// Forwards every argument unchanged to detail::word_minhash64, which does the work.
std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::word_minhash64(input, seeds, stream, mr);
}
} // namespace nvtext
Loading
Loading