From 2768783b9d7fdb4cda1a88756e8b0a493188294c Mon Sep 17 00:00:00 2001
From: davidwendt
Date: Mon, 22 Feb 2021 17:10:54 -0500
Subject: [PATCH 1/2] Change nvtext::load_vocabulary_file to return shared ptr

---
 cpp/benchmarks/text/subword_benchmark.cpp    |  4 ++--
 cpp/include/nvtext/detail/load_hash_file.hpp |  9 +++++----
 cpp/include/nvtext/subword_tokenize.hpp      |  4 ++--
 cpp/src/text/subword/load_hash_file.cu       | 13 +++++++------
 cpp/src/text/subword/subword_tokenize.cu     |  6 +++---
 cpp/tests/text/subword_tests.cpp             |  4 ++--
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword_benchmark.cpp
index d22f696c18d..3670fa7c9a7 100644
--- a/cpp/benchmarks/text/subword_benchmark.cpp
+++ b/cpp/benchmarks/text/subword_benchmark.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -65,7 +65,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
   auto vocab = nvtext::load_vocabulary_file(hash_file);
   for (auto _ : state) {
     auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                           vocab,
+                                           *vocab,
                                            max_sequence_length,
                                            stride,
                                            do_lower,
diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index a75ae3d6181..b92eaa74986 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,9 +40,10 @@ namespace detail {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);

 } // namespace detail
 } // namespace nvtext
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 8f6cf7d7c14..6edb3b9c09f 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,7 +59,7 @@ struct hashed_vocabulary {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-hashed_vocabulary load_vocabulary_file(
+std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 2e98b88ccb4..a24d6968956 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -183,9 +183,10 @@ uint64_t str_to_uint64(std::string const& str, uint64_t line_no)
  * @param filename_hashed_vocabulary Path to text file containing hashed vocabulary
  * @return object containing hash table elements for the wordpiece tokenizer
  */
-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   hashed_vocabulary result;
   std::ifstream hash_file(filename_hashed_vocabulary);
@@ -276,13 +277,13 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu
   detail::get_codepoint_metadata(stream);
   detail::get_aux_codepoint_data(stream);

-  return result;
+  return std::make_shared<hashed_vocabulary>(std::move(result));
 }

 } // namespace detail

-hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary,
-                                       rmm::mr::device_memory_resource* mr)
+std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+  std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::load_vocabulary_file(filename_hashed_vocabulary, rmm::cuda_stream_default, mr);
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index 46a84575dc1..1639af0dbde 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -256,10 +256,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t max_rows_tensor,
                                   rmm::mr::device_memory_resource* mr)
 {
-  hashed_vocabulary vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
+  auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr);
   CUDF_FUNC_RANGE();
   return detail::subword_tokenize(strings,
-                                  vocab_table,
+                                  *vocab_table,
                                   max_sequence_length,
                                   stride,
                                   do_lower_case,
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 3cab612fccd..7324cf3ec6a 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -237,7 +237,7 @@ TEST(TextSubwordTest, TokenizeFromVocabStruct)
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   auto vocab = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         vocab,
+                                         *vocab,
                                          8,
                                          6,
                                          true, // do_lower_case
@@ -307,7 +307,7 @@ TEST(TextSubwordTest, TokenizeWithSpecialTokens)
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   auto vocab = nvtext::load_vocabulary_file(hash_file);
   auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                         vocab,
+                                         *vocab,
                                          8,
                                          6,
                                          true, // do_lower_case

From 6efbf6ee76c1c40e6509c0b275e31318bc541661 Mon Sep 17 00:00:00 2001
From: davidwendt
Date: Mon, 22 Feb 2021 17:52:05 -0500
Subject: [PATCH 2/2] change shared_ptr to unique_ptr

---
 cpp/include/nvtext/detail/load_hash_file.hpp | 2 +-
 cpp/include/nvtext/subword_tokenize.hpp      | 2 +-
 cpp/src/text/subword/load_hash_file.cu       | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index b92eaa74986..b105c5c280e 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -40,7 +40,7 @@ namespace detail {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr);
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 6edb3b9c09f..8cc000ff095 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -59,7 +59,7 @@ struct hashed_vocabulary {
  * @param mr Memory resource to allocate any returned objects.
  * @return vocabulary hash-table elements
  */
-std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index a24d6968956..f3f96933f19 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -183,7 +183,7 @@ uint64_t str_to_uint64(std::string const& str, uint64_t line_no)
  * @param filename_hashed_vocabulary Path to text file containing hashed vocabulary
  * @return object containing hash table elements for the wordpiece tokenizer
  */
-std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
@@ -277,12 +277,12 @@ std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
   detail::get_codepoint_metadata(stream);
   detail::get_aux_codepoint_data(stream);

-  return std::make_shared<hashed_vocabulary>(std::move(result));
+  return std::make_unique<hashed_vocabulary>(std::move(result));
 }

 } // namespace detail

-std::shared_ptr<hashed_vocabulary> load_vocabulary_file(
+std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
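
For review context, a minimal caller-side sketch of the API after patch 2 (illustrative only, not part of the patch series): the vocabulary is now returned behind a std::unique_ptr<hashed_vocabulary> and dereferenced when passed to nvtext::subword_tokenize. The file path, helper function name, and tokenizer parameter values below are assumptions for the example, not taken from the patches.

  #include <cudf/strings/strings_column_view.hpp>
  #include <nvtext/subword_tokenize.hpp>

  void tokenize_example(cudf::column_view const& input)
  {
    // Load the hashed vocabulary once; ownership is now explicit via unique_ptr
    // and the table is freed when `vocab` goes out of scope.
    auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt");  // illustrative path

    // Pass the vocabulary table by dereferencing the pointer, matching the
    // updated call sites in the benchmark and tests above.
    auto result = nvtext::subword_tokenize(cudf::strings_column_view{input},
                                           *vocab,
                                           64,     // max_sequence_length (illustrative)
                                           48,     // stride (illustrative)
                                           true,   // do_lower_case
                                           false,  // do_truncate
                                           128);   // max_rows_tensor (illustrative)
    // `result` carries the token-id, attention-mask, and metadata outputs
    // produced by the wordpiece tokenizer.
  }

The same vocabulary object can be reused across many subword_tokenize calls, which is the motivation for returning it from load_vocabulary_file rather than re-reading the hash file per call.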