From 0a9a3a9b2ff80e187f99c8266a89bc64ba44bc13 Mon Sep 17 00:00:00 2001 From: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Date: Tue, 28 Feb 2023 15:45:32 -0800 Subject: [PATCH] Fix the build breaks the release pipeline and some C++ warnings (#372) * fix the break in release pipeline * code cleanup and the warnings fixing. * Update ci.yml for Azure Pipelines * Update ci.yml for Azure Pipelines * fix linux build * one more fixing * again? * fixing for macOS --- .gitignore | 2 +- .pipelines/ci.yml | 16 ++++ CMakeLists.txt | 7 +- {operators => base}/base64.cc | 0 {operators => base}/base64.h | 0 {operators => base}/narrow.h | 0 {operators => base}/ocos.cc | 0 {operators => base}/string_tensor.cc | 2 +- {operators => base}/string_tensor.h | 2 +- {operators => base}/string_utils.cc | 0 {operators => base}/string_utils.h | 0 {operators => base}/ustring.cc | 3 +- {operators => base}/ustring.h | 9 +- includes/ocos.h | 2 + operators/tokenizer/basic_tokenizer.cc | 3 +- operators/tokenizer/bert_tokenizer.hpp | 5 +- .../tokenizer/bert_tokenizer_decoder.hpp | 2 - operators/tokenizer/bpetokenizer.hpp | 65 ++++++++++---- operators/tokenizer/clip_tokenizer.cc | 66 +-------------- operators/tokenizer/clip_tokenizer.hpp | 9 +- operators/tokenizer/gpt2_tokenizer.cc | 74 +--------------- operators/tokenizer/gpt2_tokenizer.hpp | 8 +- operators/tokenizer/roberta_tokenizer.cc | 84 +++---------------- operators/tokenizer/roberta_tokenizer.hpp | 12 +-- operators/tokenizer/wordpiece_tokenizer.hpp | 5 +- test/test_cliptok.py | 4 +- test/test_gpt2tok.py | 6 -- test/test_robertatok.py | 4 +- 28 files changed, 123 insertions(+), 267 deletions(-) rename {operators => base}/base64.cc (100%) rename {operators => base}/base64.h (100%) rename {operators => base}/narrow.h (100%) rename {operators => base}/ocos.cc (100%) rename {operators => base}/string_tensor.cc (100%) rename {operators => base}/string_tensor.h (100%) rename {operators => base}/string_utils.cc (100%) rename {operators => base}/string_utils.h (100%) rename {operators => base}/ustring.cc (99%) rename {operators => base}/ustring.h (94%) diff --git a/.gitignore b/.gitignore index 2377dfe4f..0905bbd98 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,7 @@ out/ .scb/ onnxruntime_extensions/_version.py onnxruntime-*-*-*/ -temp_*.onnx +temp_*onnx* # Java specific ignores */.gradle java/hs_*.log diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml index a14653aa3..5a50296a6 100644 --- a/.pipelines/ci.yml +++ b/.pipelines/ci.yml @@ -478,3 +478,19 @@ jobs: echo "Exception propogation was not enabled correctly." exit 1 fi + + + ############################## + # Linux for selected_ops build + ############################## + - job: Linux_SelectedOpsBuild + pool: + vmImage: 'ubuntu-latest' + + steps: + # compiled as only one operator selected. + - bash: | + set -e -x -u + echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake + ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON + displayName: Build ort-extensions with only one operator was selected diff --git a/CMakeLists.txt b/CMakeLists.txt index 38be4d988..db5d5846f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,6 +208,9 @@ endif() if(NOT OCOS_ENABLE_CPP_EXCEPTIONS) add_compile_definitions(OCOS_NO_EXCEPTIONS ORT_NO_EXCEPTIONS) + if (NOT _ONNXRUNTIME_EMBEDDED) + add_compile_definitions(_HAS_EXCEPTIONS=0) + endif() endif() include(FetchContent) @@ -254,7 +257,7 @@ if(OCOS_ENABLE_RE2_REGEX) endif() # ### scan all source files -set(TARGET_SRC_NOEXCEPTION) +file(GLOB TARGET_SRC_NOEXCEPTION "base/*.h" "base/*.cc") file(GLOB TARGET_SRC "operators/*.cc" "operators/*.h" "includes/*.h*") if(OCOS_ENABLE_TF_STRING) @@ -402,11 +405,13 @@ standardize_output_folder(ocos_operators) target_include_directories(noexcep_operators PUBLIC ${ONNXRUNTIME_INCLUDE_DIR} ${PROJECT_SOURCE_DIR}/includes + ${PROJECT_SOURCE_DIR}/base ${PROJECT_SOURCE_DIR}/operators) target_include_directories(ocos_operators PUBLIC ${ONNXRUNTIME_INCLUDE_DIR} ${PROJECT_SOURCE_DIR}/includes + ${PROJECT_SOURCE_DIR}/base ${PROJECT_SOURCE_DIR}/operators ${PROJECT_SOURCE_DIR}/operators/tokenizer) diff --git a/operators/base64.cc b/base/base64.cc similarity index 100% rename from operators/base64.cc rename to base/base64.cc diff --git a/operators/base64.h b/base/base64.h similarity index 100% rename from operators/base64.h rename to base/base64.h diff --git a/operators/narrow.h b/base/narrow.h similarity index 100% rename from operators/narrow.h rename to base/narrow.h diff --git a/operators/ocos.cc b/base/ocos.cc similarity index 100% rename from operators/ocos.cc rename to base/ocos.cc diff --git a/operators/string_tensor.cc b/base/string_tensor.cc similarity index 100% rename from operators/string_tensor.cc rename to base/string_tensor.cc index 9e832c1c5..569a39e7b 100644 --- a/operators/string_tensor.cc +++ b/base/string_tensor.cc @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "string_tensor.h" #include "string_utils.h" #include "ustring.h" -#include "string_tensor.h" void GetTensorMutableDataString(const OrtApi& api, OrtW::CustomOpApi& ort, OrtKernelContext* context, const OrtValue* value, std::vector& output) { diff --git a/operators/string_tensor.h b/base/string_tensor.h similarity index 100% rename from operators/string_tensor.h rename to base/string_tensor.h index 6b99fe1c0..469db8da8 100644 --- a/operators/string_tensor.h +++ b/base/string_tensor.h @@ -3,8 +3,8 @@ #pragma once -#include #include "ocos.h" +#include // Retrieves a vector of strings if the input type is std::string. diff --git a/operators/string_utils.cc b/base/string_utils.cc similarity index 100% rename from operators/string_utils.cc rename to base/string_utils.cc diff --git a/operators/string_utils.h b/base/string_utils.h similarity index 100% rename from operators/string_utils.h rename to base/string_utils.h diff --git a/operators/ustring.cc b/base/ustring.cc similarity index 99% rename from operators/ustring.cc rename to base/ustring.cc index 1a06392bf..9ac9a8eb0 100644 --- a/operators/ustring.cc +++ b/base/ustring.cc @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include #include "ustring.h" +#include + ustring::ustring() : std::u32string() { } diff --git a/operators/ustring.h b/base/ustring.h similarity index 94% rename from operators/ustring.h rename to base/ustring.h index 9e960ee70..e20f90210 100644 --- a/operators/ustring.h +++ b/base/ustring.h @@ -1,15 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #pragma once -#include -#include -#include - -#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 1 -#include #include "ocos.h" +#include +#include +// ustring needs a new implementation, due to the std::codecvt deprecation. // Wrap u32string with ustring, in case we will use other implementation in the future class ustring : public std::u32string { public: diff --git a/includes/ocos.h b/includes/ocos.h index b61d1c585..32be5a69a 100644 --- a/includes/ocos.h +++ b/includes/ocos.h @@ -3,6 +3,8 @@ #pragma once +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +#include #include #include #include diff --git a/operators/tokenizer/basic_tokenizer.cc b/operators/tokenizer/basic_tokenizer.cc index d57ae1d4b..ce87c67e7 100644 --- a/operators/tokenizer/basic_tokenizer.cc +++ b/operators/tokenizer/basic_tokenizer.cc @@ -1,12 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "string_utils.h" #include "basic_tokenizer.hpp" +#include "string_utils.h" #include "string_tensor.h" #include #include -#include #include BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, diff --git a/operators/tokenizer/bert_tokenizer.hpp b/operators/tokenizer/bert_tokenizer.hpp index 7b3f320b7..4faddd067 100644 --- a/operators/tokenizer/bert_tokenizer.hpp +++ b/operators/tokenizer/bert_tokenizer.hpp @@ -3,14 +3,15 @@ #pragma once -#include -#include #include "ocos.h" #include "ustring.h" #include "string_utils.h" #include "string_tensor.h" #include "basic_tokenizer.hpp" +#include + + class BertTokenizerVocab final { public: explicit BertTokenizerVocab(std::string_view vocab); diff --git a/operators/tokenizer/bert_tokenizer_decoder.hpp b/operators/tokenizer/bert_tokenizer_decoder.hpp index a0863d434..c1e20b961 100644 --- a/operators/tokenizer/bert_tokenizer_decoder.hpp +++ b/operators/tokenizer/bert_tokenizer_decoder.hpp @@ -3,8 +3,6 @@ #pragma once -#include -#include #include "ocos.h" #include "ustring.h" #include "string_utils.h" diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp index 269ad8119..fcc8e1f63 100644 --- a/operators/tokenizer/bpetokenizer.hpp +++ b/operators/tokenizer/bpetokenizer.hpp @@ -1,28 +1,58 @@ // Licensed under the MIT License. // Partial code comes from other Microsoft employee. #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +#include "ocos.h" +#include "ustring.h" + #include -#include -#include +#include #include -#include -#include -#include +#include "unicode.h" #include "nlohmann/json.hpp" -#include "clip_tokenizer.hpp" -#include "gpt2_tokenizer.hpp" -#include "roberta_tokenizer.hpp" +#include "string_utils.h" #include "string_tensor.h" -#include "unicode.h" + +// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace) +inline bool IsUnicodeSpace(char32_t ch) { + switch (ch) { + case 0x0009: + case 0x000A: + case 0x000B: + case 0x000C: + case 0x000D: + case 0x001C: + case 0x001D: + case 0x001E: + case 0x001F: + case 0x0020: + case 0x0085: + case 0x00A0: + case 0x1680: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x2028: + case 0x2029: + case 0x202F: + case 0x205F: + case 0x3000: + return true; + } + return false; +} + +inline bool IsEmptyUString(const ustring& str) { + return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); }); +} class SpecialTokenMap { public: @@ -117,7 +147,6 @@ class VocabData { } else { int id = static_cast(vocab_map_.size()); vocab_map_[unk_token] = id; - std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl; } std::wstring_convert, char32_t> str_convert; diff --git a/operators/tokenizer/clip_tokenizer.cc b/operators/tokenizer/clip_tokenizer.cc index e123a62f2..1f565c820 100644 --- a/operators/tokenizer/clip_tokenizer.cc +++ b/operators/tokenizer/clip_tokenizer.cc @@ -1,68 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. // Partial code comes from other Microsoft employee. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nlohmann/json.hpp" -#include "bpetokenizer.hpp" -#include "string_tensor.h" -#include "unicode.h" - -// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace) -bool IsInUnicodeSpace(char32_t ch) { - switch (ch) { - case 0x0009: - case 0x000A: - case 0x000B: - case 0x000C: - case 0x000D: - case 0x001C: - case 0x001D: - case 0x001E: - case 0x001F: - case 0x0020: - case 0x0085: - case 0x00A0: - case 0x1680: - case 0x2000: - case 0x2001: - case 0x2002: - case 0x2003: - case 0x2004: - case 0x2005: - case 0x2006: - case 0x2007: - case 0x2008: - case 0x2009: - case 0x200A: - case 0x2028: - case 0x2029: - case 0x202F: - case 0x205F: - case 0x3000: - return true; - } - return false; -} - -bool IsEmptyUstring(const ustring& str) { - return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsInUnicodeSpace(ch); }); -} +#include "clip_tokenizer.hpp" +#include "string_utils.h" KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) { @@ -93,7 +33,7 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne std::vector KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length) { std::vector res; - if (IsEmptyUstring(input)) { + if (IsEmptyUString(input)) { return res; } // Add <|startoftext|> token to result diff --git a/operators/tokenizer/clip_tokenizer.hpp b/operators/tokenizer/clip_tokenizer.hpp index 8489e83a6..d5387919c 100644 --- a/operators/tokenizer/clip_tokenizer.hpp +++ b/operators/tokenizer/clip_tokenizer.hpp @@ -1,9 +1,8 @@ -#include -#include "ocos.h" -#include "ustring.h" -#include "string_utils.h" +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. -class VocabData; +#pragma once +#include "bpetokenizer.hpp" struct KernelClipBpeTokenizer : BaseKernel { KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info); diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc index 5aba4a477..2a3234c57 100644 --- a/operators/tokenizer/gpt2_tokenizer.cc +++ b/operators/tokenizer/gpt2_tokenizer.cc @@ -2,66 +2,8 @@ // Licensed under the MIT License. // Partial code comes from other Microsoft employee. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nlohmann/json.hpp" -#include "bpetokenizer.hpp" -#include "string_tensor.h" -#include "unicode.h" - -// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace) -bool IsUnicodeSpace(char32_t ch) { - switch (ch) { - case 0x0009: - case 0x000A: - case 0x000B: - case 0x000C: - case 0x000D: - case 0x001C: - case 0x001D: - case 0x001E: - case 0x001F: - case 0x0020: - case 0x0085: - case 0x00A0: - case 0x1680: - case 0x2000: - case 0x2001: - case 0x2002: - case 0x2003: - case 0x2004: - case 0x2005: - case 0x2006: - case 0x2007: - case 0x2008: - case 0x2009: - case 0x200A: - case 0x2028: - case 0x2029: - case 0x202F: - case 0x205F: - case 0x3000: - return true; - } - return false; -} +#include "gpt2_tokenizer.hpp" -bool IsEmptyUString(const ustring& str) { - return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); }); -} KernelBpeTokenizer::KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) { @@ -200,17 +142,3 @@ size_t CustomOpBpeTokenizer::GetOutputTypeCount() const { ONNXTensorElementDataType CustomOpBpeTokenizer::GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; } - -const OrtCustomOp** LoadTokenizerSchemaList() { - // create the global objects here to let the ORT catch the expection if any - static std::unique_ptr p_CoBpeTokenizer; - static const OrtCustomOp* c_CustomOpList[2] = {nullptr}; // {&c_CoBpeTokenizer, nullptr}; - static std::mutex mtx_loaded; - std::lock_guard lck(mtx_loaded); - if (p_CoBpeTokenizer.get() == nullptr) { - p_CoBpeTokenizer = std::make_unique(); - c_CustomOpList[0] = p_CoBpeTokenizer.get(); - } - - return c_CustomOpList; -} diff --git a/operators/tokenizer/gpt2_tokenizer.hpp b/operators/tokenizer/gpt2_tokenizer.hpp index ed1625d98..31b2bd2d2 100644 --- a/operators/tokenizer/gpt2_tokenizer.hpp +++ b/operators/tokenizer/gpt2_tokenizer.hpp @@ -1,8 +1,8 @@ -#include -#include "ocos.h" -#include "ustring.h" +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. -class VocabData; +#pragma once +#include "bpetokenizer.hpp" struct KernelBpeTokenizer : BaseKernel { KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info); diff --git a/operators/tokenizer/roberta_tokenizer.cc b/operators/tokenizer/roberta_tokenizer.cc index 4737c054e..c886ce7e7 100644 --- a/operators/tokenizer/roberta_tokenizer.cc +++ b/operators/tokenizer/roberta_tokenizer.cc @@ -2,67 +2,9 @@ // Licensed under the MIT License. // Partial code comes from other Microsoft employee. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nlohmann/json.hpp" -#include "bpetokenizer.hpp" -#include "string_tensor.h" -#include "unicode.h" - -// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace) -bool IsWithinUnicodeSpace(char32_t ch) { - switch (ch) { - case 0x0009: - case 0x000A: - case 0x000B: - case 0x000C: - case 0x000D: - case 0x001C: - case 0x001D: - case 0x001E: - case 0x001F: - case 0x0020: - case 0x0085: - case 0x00A0: - case 0x1680: - case 0x2000: - case 0x2001: - case 0x2002: - case 0x2003: - case 0x2004: - case 0x2005: - case 0x2006: - case 0x2007: - case 0x2008: - case 0x2009: - case 0x200A: - case 0x2028: - case 0x2029: - case 0x202F: - case 0x205F: - case 0x3000: - return true; - } - return false; -} +#include "roberta_tokenizer.hpp" +#include "narrow.h" -bool IsEmptyuString(const ustring& str) { - return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsWithinUnicodeSpace(ch); }); -} KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) { @@ -90,10 +32,10 @@ KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const Or bbpe_tokenizer_->Load(vocabu_stream, merges_stream, "<|endoftext|>", "<|endoftext|>"); } -std::vector KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list>>& offset_map) { +std::vector KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list& offset_map) { std::vector res; - if (IsEmptyuString(input)) { + if (IsEmptyUString(input)) { return res; } // Add BOS token to result @@ -116,8 +58,8 @@ std::vector KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t const char32_t* ptr = cur_input.c_str(); regcmp.Set(ptr); - int offset = 0; - std::list> offset_mapping; + size_t offset = 0; + OffsetMappingType offset_mapping; // Add offset mapping for BOS token offset_mapping.push_back(std::make_pair(0, 0)); @@ -130,16 +72,16 @@ std::vector KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t // Handle offset mapping and special cases if (utf8_token.at(0) == ' ') { - offset_mapping.push_back(std::make_pair(offset + 1, offset + utf8_token.size())); + offset_mapping.emplace_back(std::make_pair(offset + 1, ort_extensions::narrow(offset + utf8_token.size()))); } else { - offset_mapping.push_back(std::make_pair(offset, offset + utf8_token.size())); + offset_mapping.emplace_back(std::make_pair(offset, ort_extensions::narrow(offset + utf8_token.size()))); } offset += utf8_token.size(); // Get byte encodings prior to performing BPE byte_list_.clear(); for (char& cp : utf8_token) { - byte_list_.push_back(bbpe_tokenizer_->ByteEncoder()[static_cast(cp)]); + byte_list_.emplace_back(bbpe_tokenizer_->ByteEncoder()[static_cast(cp)]); } // Perform BPE @@ -155,13 +97,13 @@ std::vector KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t } } // Add offset mapping for EOS token - offset_mapping.push_back(std::make_pair(0, 0)); + offset_mapping.emplace_back(std::make_pair(0, 0)); // Add offset mappings for input in this instance to list of offset mappings for all inputs - offset_map.push_back(offset_mapping); + offset_map.emplace_back(offset_mapping); } // Add EOS token to result - res.push_back(bbpe_tokenizer_->GetEncoding("")); + res.emplace_back(bbpe_tokenizer_->GetEncoding("")); return res; } @@ -169,7 +111,7 @@ void KernelRobertaBpeTokenizer::Compute(OrtKernelContext* context) { // Setup inputs const OrtValue* input = ort_.KernelContext_GetInput(context, 0); std::vector str_input; - std::list>> offset_map; + std::list offset_map; GetTensorMutableDataString(api_, ort_, context, input, str_input); OrtTensorDimensions input_dim(ort_, input); diff --git a/operators/tokenizer/roberta_tokenizer.hpp b/operators/tokenizer/roberta_tokenizer.hpp index e0252a304..b499b6866 100644 --- a/operators/tokenizer/roberta_tokenizer.hpp +++ b/operators/tokenizer/roberta_tokenizer.hpp @@ -1,16 +1,16 @@ -#include -#include "ocos.h" -#include "ustring.h" -#include "string_utils.h" +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. -class VocabData; +#pragma once +#include "bpetokenizer.hpp" struct KernelRobertaBpeTokenizer : BaseKernel { KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info); void Compute(OrtKernelContext* context); private: - std::vector Tokenize(ustring& input, int64_t max_length, std::list>>& offset_map); + using OffsetMappingType = std::list>; + std::vector Tokenize(ustring& input, int64_t max_length, std::list& offset_map); int64_t padding_length_; std::list byte_list_; diff --git a/operators/tokenizer/wordpiece_tokenizer.hpp b/operators/tokenizer/wordpiece_tokenizer.hpp index 1bdb8d893..a34b82cdb 100644 --- a/operators/tokenizer/wordpiece_tokenizer.hpp +++ b/operators/tokenizer/wordpiece_tokenizer.hpp @@ -3,13 +3,14 @@ #pragma once -#include -#include #include "ocos.h" #include "ustring.h" #include "string_utils.h" #include "string_tensor.h" +#include + + struct KernelWordpieceTokenizer : BaseKernel { KernelWordpieceTokenizer(const OrtApi& api, const OrtKernelInfo& info); void Compute(OrtKernelContext* context); diff --git a/test/test_cliptok.py b/test/test_cliptok.py index ce8fc49c2..028eecd56 100644 --- a/test/test_cliptok.py +++ b/test/test_cliptok.py @@ -39,7 +39,9 @@ class TestCLIPTokenizer(unittest.TestCase): @classmethod def setUpClass(cls): cls.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - files = cls.tokenizer.save_vocabulary(".") + temp_dir = Path('./temp_onnxclip') + temp_dir.mkdir(parents=True, exist_ok=True) + files = cls.tokenizer.save_vocabulary(str(temp_dir)) cls.tokjson = files[0] cls.merges = files[1] diff --git a/test/test_gpt2tok.py b/test/test_gpt2tok.py index 2fae5988c..6b1988c68 100644 --- a/test/test_gpt2tok.py +++ b/test/test_gpt2tok.py @@ -2,7 +2,6 @@ import numpy as np import onnxruntime as _ort -from pathlib import Path from onnx import helper, onnx_pb as onnx_proto from transformers import GPT2Tokenizer from onnxruntime_extensions import ( @@ -17,11 +16,6 @@ def _get_file_content(path): return file.read() -def _get_test_data_file(*sub_dirs): - test_dir = Path(__file__).parent - return str(test_dir.joinpath(*sub_dirs)) - - def _create_test_model(**kwargs): vocab_file = kwargs["vocab_file"] merges_file = kwargs["merges_file"] diff --git a/test/test_robertatok.py b/test/test_robertatok.py index 053a23ac6..4eb2681e1 100644 --- a/test/test_robertatok.py +++ b/test/test_robertatok.py @@ -42,7 +42,9 @@ class TestRobertaTokenizer(unittest.TestCase): @classmethod def setUpClass(cls): cls.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") - files = cls.tokenizer.save_vocabulary(".") + temp_dir = Path('./temp_onnxroberta') + temp_dir.mkdir(parents=True, exist_ok=True) + files = cls.tokenizer.save_vocabulary(str(temp_dir)) cls.tokjson = files[0] cls.merges = files[1]