From b0e5aef46a0fc23a67bb2980d5e7707870bb15c4 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 Feb 2021 08:26:36 -0500 Subject: [PATCH] Change nvtext::load_vocabulary_file to return a unique ptr (#7424) Reference #5868 This PR changes `nvtext::load_vocabulary_file` to return a unique-pointer to make it easier to manage in a Python/Cython class object. The original signature returned a flat structure that contained unique-pointers, which would make it difficult to copy and manage. The corresponding gtests and gbenchmarks were updated for this API change. Authors: - David (@davidwendt) Approvers: - Conor Hoekstra (@codereport) - Karthikeyan (@karthikeyann) URL: https://github.com/rapidsai/cudf/pull/7424 --- cpp/benchmarks/text/subword_benchmark.cpp | 4 ++-- cpp/include/nvtext/detail/load_hash_file.hpp | 9 +++++---- cpp/include/nvtext/subword_tokenize.hpp | 4 ++-- cpp/src/text/subword/load_hash_file.cu | 13 +++++++------ cpp/src/text/subword/subword_tokenize.cu | 6 +++--- cpp/tests/text/subword_tests.cpp | 4 ++-- 6 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword_benchmark.cpp index d22f696c18d..3670fa7c9a7 100644 --- a/cpp/benchmarks/text/subword_benchmark.cpp +++ b/cpp/benchmarks/text/subword_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -65,7 +65,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state) auto vocab = nvtext::load_vocabulary_file(hash_file); for (auto _ : state) { auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - vocab, + *vocab, max_sequence_length, stride, do_lower, diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index a75ae3d6181..b105c5c280e 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,10 @@ namespace detail { * @param mr Memory resource to allocate any returned objects. * @return vocabulary hash-table elements */ -hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr<hashed_vocabulary> load_vocabulary_file( + std::string const& filename_hashed_vocabulary, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace nvtext diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 8f6cf7d7c14..8cc000ff095 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ struct hashed_vocabulary { * @param mr Memory resource to allocate any returned objects. 
* @return vocabulary hash-table elements */ -hashed_vocabulary load_vocabulary_file( +std::unique_ptr<hashed_vocabulary> load_vocabulary_file( std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 2e98b88ccb4..f3f96933f19 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -183,9 +183,10 @@ uint64_t str_to_uint64(std::string const& str, uint64_t line_no) * @param filename_hashed_vocabulary Path to text file containing hashed vocabulary * @return object containing hash table elements for the wordpiece tokenizer */ -hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr<hashed_vocabulary> load_vocabulary_file( + std::string const& filename_hashed_vocabulary, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { hashed_vocabulary result; std::ifstream hash_file(filename_hashed_vocabulary); @@ -276,13 +277,13 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu detail::get_codepoint_metadata(stream); detail::get_aux_codepoint_data(stream); - return result; + return std::make_unique<hashed_vocabulary>(std::move(result)); } } // namespace detail -hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - rmm::mr::device_memory_resource* mr) +std::unique_ptr<hashed_vocabulary> load_vocabulary_file( + std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::load_vocabulary_file(filename_hashed_vocabulary, rmm::cuda_stream_default, mr); diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 46a84575dc1..1639af0dbde 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -256,10 +256,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, uint32_t max_rows_tensor, rmm::mr::device_memory_resource* mr) { - hashed_vocabulary vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr); + auto vocab_table = load_vocabulary_file(filename_hashed_vocabulary, mr); CUDF_FUNC_RANGE(); return detail::subword_tokenize(strings, - vocab_table, + *vocab_table, max_sequence_length, stride, do_lower_case, diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index 3cab612fccd..7324cf3ec6a 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -237,7 +237,7 @@ TEST(TextSubwordTest, TokenizeFromVocabStruct) cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); auto vocab = nvtext::load_vocabulary_file(hash_file); auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - vocab, + *vocab, 8, 6, true, // do_lower_case @@ -307,7 +307,7 @@ TEST(TextSubwordTest, TokenizeWithSpecialTokens) cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); auto vocab = nvtext::load_vocabulary_file(hash_file); auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - vocab, + *vocab, 8, 6, true, // do_lower_case