From f2dcd19f5d703b8ab034e9e36f6228af01e81763 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Mon, 27 Feb 2023 18:06:05 -0800
Subject: [PATCH 1/8] fix the break in the release pipeline

---
 CMakeLists.txt | 7 +++-
 {operators => base}/base64.cc | 0
 {operators => base}/base64.h | 0
 {operators => base}/narrow.h | 0
 {operators => base}/ocos.cc | 0
 {operators => base}/string_tensor.cc | 0
 {operators => base}/string_tensor.h | 0
 {operators => base}/string_utils.cc | 0
 {operators => base}/string_utils.h | 0
 {operators => base}/ustring.cc | 0
 {operators => base}/ustring.h | 1 +
 operators/tokenizer/bpetokenizer.hpp | 46 ++++++++++++++++++++--
 operators/tokenizer/clip_tokenizer.cc | 48 +----------------------
 operators/tokenizer/clip_tokenizer.hpp | 2 +-
 operators/tokenizer/gpt2_tokenizer.cc | 45 +--------------------
 operators/tokenizer/gpt2_tokenizer.hpp | 3 +-
 operators/tokenizer/roberta_tokenizer.cc | 48 +----------------------
 operators/tokenizer/roberta_tokenizer.hpp | 3 +-
 18 files changed, 57 insertions(+), 146 deletions(-)
 rename {operators => base}/base64.cc (100%)
 rename {operators => base}/base64.h (100%)
 rename {operators => base}/narrow.h (100%)
 rename {operators => base}/ocos.cc (100%)
 rename {operators => base}/string_tensor.cc (100%)
 rename {operators => base}/string_tensor.h (100%)
 rename {operators => base}/string_utils.cc (100%)
 rename {operators => base}/string_utils.h (100%)
 rename {operators => base}/ustring.cc (100%)
 rename {operators => base}/ustring.h (96%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 38be4d988..db5d5846f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,9 @@ endif()
 
 if(NOT OCOS_ENABLE_CPP_EXCEPTIONS)
   add_compile_definitions(OCOS_NO_EXCEPTIONS ORT_NO_EXCEPTIONS)
+  if (NOT _ONNXRUNTIME_EMBEDDED)
+    add_compile_definitions(_HAS_EXCEPTIONS=0)
+  endif()
 endif()
 
 include(FetchContent)
@@ -254,7 +257,7 @@ if(OCOS_ENABLE_RE2_REGEX)
 endif()
 
 # ### scan all source files
-set(TARGET_SRC_NOEXCEPTION)
+file(GLOB TARGET_SRC_NOEXCEPTION "base/*.h" "base/*.cc")
 file(GLOB TARGET_SRC "operators/*.cc" "operators/*.h" "includes/*.h*")
 
 if(OCOS_ENABLE_TF_STRING)
@@ -402,11 +405,13 @@ standardize_output_folder(ocos_operators)
 target_include_directories(noexcep_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators)
 
 target_include_directories(ocos_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators
   ${PROJECT_SOURCE_DIR}/operators/tokenizer)
diff --git a/operators/base64.cc b/base/base64.cc
similarity index 100%
rename from operators/base64.cc
rename to base/base64.cc
diff --git a/operators/base64.h b/base/base64.h
similarity index 100%
rename from operators/base64.h
rename to base/base64.h
diff --git a/operators/narrow.h b/base/narrow.h
similarity index 100%
rename from operators/narrow.h
rename to base/narrow.h
diff --git a/operators/ocos.cc b/base/ocos.cc
similarity index 100%
rename from operators/ocos.cc
rename to base/ocos.cc
diff --git a/operators/string_tensor.cc b/base/string_tensor.cc
similarity index 100%
rename from operators/string_tensor.cc
rename to base/string_tensor.cc
diff --git a/operators/string_tensor.h b/base/string_tensor.h
similarity index 100%
rename from operators/string_tensor.h
rename to base/string_tensor.h
diff --git a/operators/string_utils.cc b/base/string_utils.cc
similarity index 100%
rename from operators/string_utils.cc
rename to base/string_utils.cc
diff --git a/operators/string_utils.h b/base/string_utils.h
similarity index 100%
rename from operators/string_utils.h
rename to base/string_utils.h
diff --git a/operators/ustring.cc b/base/ustring.cc
similarity index 100%
rename from operators/ustring.cc
rename to base/ustring.cc
diff --git a/operators/ustring.h b/base/ustring.h
similarity index 96%
rename from operators/ustring.h
rename to base/ustring.h
index 9e960ee70..cf817df56 100644
--- a/operators/ustring.h
+++ b/base/ustring.h
@@ -10,6 +10,7 @@
 
 #include "ocos.h"
 
+// ustring needs a new implementation, due to the std::codecvt deprecation.
 // Wrap u32string with ustring, in case we will use other implementation in the future
 class ustring : public std::u32string {
  public:
diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 269ad8119..0b7d093b7 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -17,12 +17,50 @@
 #include
 #include
 
+#include "unicode.h"
 #include "nlohmann/json.hpp"
-#include "clip_tokenizer.hpp"
-#include "gpt2_tokenizer.hpp"
-#include "roberta_tokenizer.hpp"
 #include "string_tensor.h"
-#include "unicode.h"
+
+// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
+inline bool IsUnicodeSpace(char32_t ch) {
+  switch (ch) {
+    case 0x0009:
+    case 0x000A:
+    case 0x000B:
+    case 0x000C:
+    case 0x000D:
+    case 0x001C:
+    case 0x001D:
+    case 0x001E:
+    case 0x001F:
+    case 0x0020:
+    case 0x0085:
+    case 0x00A0:
+    case 0x1680:
+    case 0x2000:
+    case 0x2001:
+    case 0x2002:
+    case 0x2003:
+    case 0x2004:
+    case 0x2005:
+    case 0x2006:
+    case 0x2007:
+    case 0x2008:
+    case 0x2009:
+    case 0x200A:
+    case 0x2028:
+    case 0x2029:
+    case 0x202F:
+    case 0x205F:
+    case 0x3000:
+      return true;
+  }
+  return false;
+}
+
+inline bool IsEmptyUString(const ustring& str) {
+  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
+}
 
 class SpecialTokenMap {
  public:
diff --git a/operators/tokenizer/clip_tokenizer.cc b/operators/tokenizer/clip_tokenizer.cc
index e123a62f2..434daf901 100644
--- a/operators/tokenizer/clip_tokenizer.cc
+++ b/operators/tokenizer/clip_tokenizer.cc
@@ -18,51 +18,7 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsInUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
-
-bool IsEmptyUstring(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsInUnicodeSpace(ch); });
-}
+#include "clip_tokenizer.hpp"
 
 KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -93,7 +49,7 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
 std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length) {
   std::vector<int64_t> res;
 
-  if (IsEmptyUstring(input)) {
+  if (IsEmptyUString(input)) {
     return res;
   }
   // Add <|startoftext|> token to result
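For reference, the IsEmptyUString helper this patch consolidates into bpetokenizer.hpp treats a string as "empty" when every code point is CPython-style whitespace. A minimal standalone sketch (ustring is stood in by a plain std::u32string alias, and the case list is abbreviated to a few representative code points; the real table is the full list above):

    #include <algorithm>
    #include <cassert>
    #include <string>

    using ustring = std::u32string;  // stand-in for the project's ustring wrapper

    inline bool IsUnicodeSpace(char32_t ch) {
      switch (ch) {  // abbreviated: the consolidated helper lists all 29 code points
        case 0x0009:
        case 0x000A:
        case 0x0020:
        case 0x3000:
          return true;
      }
      return false;
    }

    inline bool IsEmptyUString(const ustring& str) {
      return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
    }

    int main() {
      assert(IsEmptyUString(U"\t \u3000"));  // whitespace-only input short-circuits Tokenize()
      assert(!IsEmptyUString(U" token "));   // any non-space code point counts as content
      return 0;
    }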
diff --git a/operators/tokenizer/clip_tokenizer.hpp b/operators/tokenizer/clip_tokenizer.hpp
index 8489e83a6..8cf4b0c58 100644
--- a/operators/tokenizer/clip_tokenizer.hpp
+++ b/operators/tokenizer/clip_tokenizer.hpp
@@ -3,7 +3,7 @@
 #include "ustring.h"
 #include "string_utils.h"
 
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelClipBpeTokenizer : BaseKernel {
   KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);
diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc
index 5aba4a477..2f0501b76 100644
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@@ -17,51 +17,8 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
+#include "gpt2_tokenizer.hpp"
 
-bool IsEmptyUString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
-}
 
 KernelBpeTokenizer::KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
diff --git a/operators/tokenizer/gpt2_tokenizer.hpp b/operators/tokenizer/gpt2_tokenizer.hpp
index ed1625d98..1eeb40246 100644
--- a/operators/tokenizer/gpt2_tokenizer.hpp
+++ b/operators/tokenizer/gpt2_tokenizer.hpp
@@ -1,8 +1,7 @@
 #include
 #include "ocos.h"
 #include "ustring.h"
-
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelBpeTokenizer : BaseKernel {
   KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);
diff --git a/operators/tokenizer/roberta_tokenizer.cc b/operators/tokenizer/roberta_tokenizer.cc
index 4737c054e..eb1bc8f81 100644
--- a/operators/tokenizer/roberta_tokenizer.cc
+++ b/operators/tokenizer/roberta_tokenizer.cc
@@ -18,51 +18,7 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsWithinUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
-
-bool IsEmptyuString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsWithinUnicodeSpace(ch); });
-}
+#include "roberta_tokenizer.hpp"
 
 KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -93,7 +49,7 @@ KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const Or
 std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map) {
   std::vector<int64_t> res;
 
-  if (IsEmptyuString(input)) {
+  if (IsEmptyUString(input)) {
     return res;
   }
   // Add BOS token to result
diff --git a/operators/tokenizer/roberta_tokenizer.hpp b/operators/tokenizer/roberta_tokenizer.hpp
index e0252a304..6ab4fecbe 100644
--- a/operators/tokenizer/roberta_tokenizer.hpp
+++ b/operators/tokenizer/roberta_tokenizer.hpp
@@ -2,8 +2,7 @@
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
-
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelRobertaBpeTokenizer : BaseKernel {
   KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);

From 371b6e6735d83666c2b2567de825f448f46c9e6c Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Mon, 27 Feb 2023 21:13:53 -0800
Subject: [PATCH 2/8] code cleanup and warning fixes

---
 .gitignore | 2 +-
 .pipelines/ci.yml | 15 ++++++++
 base/string_tensor.cc | 2 +-
 base/string_tensor.h | 2 +-
 base/ustring.cc | 3 +-
 base/ustring.h | 7 +---
 includes/ocos.h | 2 +
 operators/tokenizer/basic_tokenizer.cc | 3 +-
 operators/tokenizer/bert_tokenizer.hpp | 2 -
 operators/tokenizer/bert_tokenizer_decoder.hpp | 2 -
 operators/tokenizer/bpetokenizer.hpp | 16 ++------
 operators/tokenizer/clip_tokenizer.cc | 18 +--------
 operators/tokenizer/clip_tokenizer.hpp | 7 ++--
 operators/tokenizer/gpt2_tokenizer.cc | 29 --------------
 operators/tokenizer/gpt2_tokenizer.hpp | 7 ++--
 operators/tokenizer/roberta_tokenizer.cc | 38 ++++++-------------
 operators/tokenizer/roberta_tokenizer.hpp | 11 +++---
 operators/tokenizer/wordpiece_tokenizer.hpp | 2 -
 test/test_cliptok.py | 4 +-
 test/test_gpt2tok.py | 6 ---
 test/test_robertatok.py | 4 +-
 21 files changed, 59 insertions(+), 123 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2377dfe4f..0905bbd98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,7 +38,7 @@ out/
 .scb/
 onnxruntime_extensions/_version.py
 onnxruntime-*-*-*/
-temp_*.onnx
+temp_*onnx*
 # Java specific ignores
 */.gradle
 java/hs_*.log
diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index a14653aa3..37516ad28 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -478,3 +478,18 @@ jobs:
         echo "Exception propogation was not enabled correctly."
         exit 1
       fi
+
+
+  ##############################
+  # Linux for selected_ops build
+  ##############################
+  - job: Linux
+    pool:
+      vmImage: 'ubuntu-latest'
+
+    steps:
+    # compiled as only one operator selected.
+    - script: |
+        set -e -x -u
+        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
+        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
diff --git a/base/string_tensor.cc b/base/string_tensor.cc
index 9e832c1c5..569a39e7b 100644
--- a/base/string_tensor.cc
+++ b/base/string_tensor.cc
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "string_tensor.h"
 #include "string_utils.h"
 #include "ustring.h"
-#include "string_tensor.h"
 
 void GetTensorMutableDataString(const OrtApi& api, OrtW::CustomOpApi& ort, OrtKernelContext* context,
                                 const OrtValue* value, std::vector<std::string>& output) {
diff --git a/base/string_tensor.h b/base/string_tensor.h
index 6b99fe1c0..469db8da8 100644
--- a/base/string_tensor.h
+++ b/base/string_tensor.h
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include
 #include "ocos.h"
+#include
 
 // Retrieves a vector of strings if the input type is std::string.
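The string_tensor.h helpers above are the entry point every string kernel in this series uses. A sketch of the call pattern, mirroring KernelRobertaBpeTokenizer::Compute later in this patch; KernelUpperCase is a hypothetical kernel written for illustration, and it assumes the repo's ocos.h/string_tensor.h plus the FillTensorDataString counterpart declared alongside the getter:

    #include "ocos.h"
    #include "string_tensor.h"
    #include <algorithm>
    #include <cctype>

    struct KernelUpperCase : BaseKernel {
      KernelUpperCase(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) {}

      void Compute(OrtKernelContext* context) {
        const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
        std::vector<std::string> str_input;
        // Copy every element of the input string tensor into a std::vector.
        GetTensorMutableDataString(api_, ort_, context, input, str_input);

        for (auto& s : str_input) {
          std::transform(s.begin(), s.end(), s.begin(),
                         [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
        }

        // Allocate an output of the same shape and write the strings back.
        OrtTensorDimensions dims(ort_, input);
        OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dims.data(), dims.size());
        FillTensorDataString(api_, ort_, context, str_input, output);
      }
    };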
diff --git a/base/ustring.cc b/base/ustring.cc
index 1a06392bf..9ac9a8eb0 100644
--- a/base/ustring.cc
+++ b/base/ustring.cc
@@ -1,7 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include
 #include "ustring.h"
+#include
+
 
 ustring::ustring() : std::u32string() {
 }
diff --git a/base/ustring.h b/base/ustring.h
index cf817df56..e20a21053 100644
--- a/base/ustring.h
+++ b/base/ustring.h
@@ -1,14 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #pragma once
-#include
-#include
-#include
-
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 1
-#include
 
 #include "ocos.h"
+#include
 
 // ustring needs a new implementation, due to the std::codecvt deprecation.
 // Wrap u32string with ustring, in case we will use other implementation in the future
diff --git a/includes/ocos.h b/includes/ocos.h
index b61d1c585..32be5a69a 100644
--- a/includes/ocos.h
+++ b/includes/ocos.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#include <codecvt>
 #include
 #include
 #include
diff --git a/operators/tokenizer/basic_tokenizer.cc b/operators/tokenizer/basic_tokenizer.cc
index d57ae1d4b..ce87c67e7 100644
--- a/operators/tokenizer/basic_tokenizer.cc
+++ b/operators/tokenizer/basic_tokenizer.cc
@@ -1,12 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "string_utils.h"
 #include "basic_tokenizer.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
 #include
 #include
-#include
 #include
 
 BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents,
diff --git a/operators/tokenizer/bert_tokenizer.hpp b/operators/tokenizer/bert_tokenizer.hpp
index 7b3f320b7..bd78d2635 100644
--- a/operators/tokenizer/bert_tokenizer.hpp
+++ b/operators/tokenizer/bert_tokenizer.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/operators/tokenizer/bert_tokenizer_decoder.hpp b/operators/tokenizer/bert_tokenizer_decoder.hpp
index a0863d434..c1e20b961 100644
--- a/operators/tokenizer/bert_tokenizer_decoder.hpp
+++ b/operators/tokenizer/bert_tokenizer_decoder.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 0b7d093b7..5907c5615 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -1,24 +1,15 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 #pragma once
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include "ustring.h"
+
 #include
-#include
-#include
 #include
 #include
-#include
-#include
 
 #include "unicode.h"
 #include "nlohmann/json.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
 
 // Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
@@ -155,7 +146,6 @@ class VocabData {
     } else {
       int id = static_cast<int>(vocab_map_.size());
       vocab_map_[unk_token] = id;
-      std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;
     }
 
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;
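The reshuffling above is driven by one constraint: _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING must be defined before <codecvt> is first seen, which is why it moves into ocos.h, the header everything else pulls in first. A self-contained sketch of the deprecated conversion that ustring and VocabData still rely on (hence the "needs a new implementation" comment):

    // MSVC's STL rejects <codecvt> under /std:c++17 unless this macro is
    // visible first; on gcc/clang it is inert and you merely get a
    // deprecation warning.
    #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
    #include <codecvt>
    #include <locale>
    #include <string>
    #include <cassert>

    int main() {
      std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
      std::u32string u32 = conv.from_bytes("caf\xc3\xa9");  // UTF-8 -> UTF-32
      assert(u32.size() == 4);                 // 'e-acute' is one code point
      std::string bytes = conv.to_bytes(u32);  // UTF-32 -> UTF-8
      assert(bytes.size() == 5);               // ...but two UTF-8 bytes
      return 0;
    }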
diff --git a/operators/tokenizer/clip_tokenizer.cc b/operators/tokenizer/clip_tokenizer.cc
index 434daf901..1f565c820 100644
--- a/operators/tokenizer/clip_tokenizer.cc
+++ b/operators/tokenizer/clip_tokenizer.cc
@@ -1,24 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "clip_tokenizer.hpp"
+#include "string_utils.h"
 
 KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
diff --git a/operators/tokenizer/clip_tokenizer.hpp b/operators/tokenizer/clip_tokenizer.hpp
index 8cf4b0c58..d5387919c 100644
--- a/operators/tokenizer/clip_tokenizer.hpp
+++ b/operators/tokenizer/clip_tokenizer.hpp
@@ -1,8 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
-#include "string_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelClipBpeTokenizer : BaseKernel {
diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc
index 2f0501b76..2a3234c57 100644
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@@ -2,21 +2,6 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "gpt2_tokenizer.hpp"
 
 
@@ -157,17 +142,3 @@ size_t CustomOpBpeTokenizer::GetOutputTypeCount() const {
 ONNXTensorElementDataType CustomOpBpeTokenizer::GetOutputType(size_t /*index*/) const {
   return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
 }
-
-const OrtCustomOp** LoadTokenizerSchemaList() {
-  // create the global objects here to let the ORT catch the expection if any
-  static std::unique_ptr<CustomOpBpeTokenizer> p_CoBpeTokenizer;
-  static const OrtCustomOp* c_CustomOpList[2] = {nullptr};  // {&c_CoBpeTokenizer, nullptr};
-  static std::mutex mtx_loaded;
-  std::lock_guard<std::mutex> lck(mtx_loaded);
-  if (p_CoBpeTokenizer.get() == nullptr) {
-    p_CoBpeTokenizer = std::make_unique<CustomOpBpeTokenizer>();
-    c_CustomOpList[0] = p_CoBpeTokenizer.get();
-  }
-
-  return c_CustomOpList;
-}
diff --git a/operators/tokenizer/gpt2_tokenizer.hpp b/operators/tokenizer/gpt2_tokenizer.hpp
index 1eeb40246..31b2bd2d2 100644
--- a/operators/tokenizer/gpt2_tokenizer.hpp
+++ b/operators/tokenizer/gpt2_tokenizer.hpp
@@ -1,6 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelBpeTokenizer : BaseKernel {
diff --git a/operators/tokenizer/roberta_tokenizer.cc b/operators/tokenizer/roberta_tokenizer.cc
index eb1bc8f81..c886ce7e7 100644
--- a/operators/tokenizer/roberta_tokenizer.cc
+++ b/operators/tokenizer/roberta_tokenizer.cc
@@ -2,23 +2,9 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "roberta_tokenizer.hpp"
+#include "narrow.h"
+
 
 KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -46,7 +32,7 @@ KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const Or
   bbpe_tokenizer_->Load(vocabu_stream, merges_stream, "<|endoftext|>", "<|endoftext|>");
 }
 
-std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map) {
+std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map) {
   std::vector<int64_t> res;
 
   if (IsEmptyUString(input)) {
@@ -72,8 +58,8 @@
   const char32_t* ptr = cur_input.c_str();
   regcmp.Set(ptr);
 
-  int offset = 0;
-  std::list<std::pair<int64_t, int64_t>> offset_mapping;
+  size_t offset = 0;
+  OffsetMappingType offset_mapping;
 
   // Add offset mapping for BOS token
   offset_mapping.push_back(std::make_pair(0, 0));
@@ -86,16 +72,16 @@
 
       // Handle offset mapping and special cases
       if (utf8_token.at(0) == ' ') {
-        offset_mapping.push_back(std::make_pair(offset + 1, offset + utf8_token.size()));
+        offset_mapping.emplace_back(std::make_pair(offset + 1, ort_extensions::narrow<size_t>(offset + utf8_token.size())));
       } else {
-        offset_mapping.push_back(std::make_pair(offset, offset + utf8_token.size()));
+        offset_mapping.emplace_back(std::make_pair(offset, ort_extensions::narrow<size_t>(offset + utf8_token.size())));
       }
       offset += utf8_token.size();
 
       // Get byte encodings prior to performing BPE
       byte_list_.clear();
       for (char& cp : utf8_token) {
-        byte_list_.push_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);
+        byte_list_.emplace_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);
       }
 
       // Perform BPE
@@ -111,13 +97,13 @@
       }
     }
     // Add offset mapping for EOS token
-    offset_mapping.push_back(std::make_pair(0, 0));
+    offset_mapping.emplace_back(std::make_pair(0, 0));
     // Add offset mappings for input in this instance to list of offset mappings for all inputs
-    offset_map.push_back(offset_mapping);
+    offset_map.emplace_back(offset_mapping);
   }
 
   // Add EOS token to result
-  res.push_back(bbpe_tokenizer_->GetEncoding("</s>"));
+  res.emplace_back(bbpe_tokenizer_->GetEncoding("</s>"));
 
   return res;
 }
@@ -125,7 +111,7 @@ void KernelRobertaBpeTokenizer::Compute(OrtKernelContext* context) {
   // Setup inputs
   const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
   std::vector<std::string> str_input;
-  std::list<std::list<std::pair<int64_t, int64_t>>> offset_map;
+  std::list<OffsetMappingType> offset_map;
   GetTensorMutableDataString(api_, ort_, context, input, str_input);
   OrtTensorDimensions input_dim(ort_, input);
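base/narrow.h itself is not shown in this series; judging by the call sites above, it follows the gsl::narrow pattern: a static_cast that refuses to lose information, aborting rather than throwing so it stays usable in OCOS_NO_EXCEPTIONS builds. A sketch of that pattern (an illustration only, not the repo's actual implementation):

    #include <cstdio>
    #include <cstdlib>
    #include <type_traits>

    template <typename T, typename U>
    T narrow(U u) {
      T t = static_cast<T>(u);
      // Round-trip check: if the value changed, the conversion was lossy.
      if (static_cast<U>(t) != u) {
        std::fputs("narrowing error\n", stderr);
        std::abort();
      }
      // Mixed-signedness check: a sign flip also means the value was mangled.
      if (std::is_signed<T>::value != std::is_signed<U>::value && ((t < T{}) != (u < U{}))) {
        std::fputs("narrowing error\n", stderr);
        std::abort();
      }
      return t;
    }

    int main() {
      size_t token_end = 1048576;
      int ok = narrow<int>(token_end);  // fits: returns 1048576
      (void)ok;
      // narrow<short>(token_end);      // would abort: 1048576 exceeds SHRT_MAX
      return 0;
    }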
diff --git a/operators/tokenizer/roberta_tokenizer.hpp b/operators/tokenizer/roberta_tokenizer.hpp
index 6ab4fecbe..b499b6866 100644
--- a/operators/tokenizer/roberta_tokenizer.hpp
+++ b/operators/tokenizer/roberta_tokenizer.hpp
@@ -1,7 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
-#include "string_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelRobertaBpeTokenizer : BaseKernel {
@@ -9,7 +9,8 @@ struct KernelRobertaBpeTokenizer : BaseKernel {
   void Compute(OrtKernelContext* context);
 
  private:
-  std::vector<int64_t> Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map);
+  using OffsetMappingType = std::list<std::pair<size_t, size_t>>;
+  std::vector<int64_t> Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map);
 
   int64_t padding_length_;
   std::list byte_list_;
diff --git a/operators/tokenizer/wordpiece_tokenizer.hpp b/operators/tokenizer/wordpiece_tokenizer.hpp
index 1bdb8d893..298a5b197 100644
--- a/operators/tokenizer/wordpiece_tokenizer.hpp
+++ b/operators/tokenizer/wordpiece_tokenizer.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/test/test_cliptok.py b/test/test_cliptok.py
index ce8fc49c2..028eecd56 100644
--- a/test/test_cliptok.py
+++ b/test/test_cliptok.py
@@ -39,7 +39,9 @@ class TestCLIPTokenizer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
-        files = cls.tokenizer.save_vocabulary(".")
+        temp_dir = Path('./temp_onnxclip')
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        files = cls.tokenizer.save_vocabulary(str(temp_dir))
         cls.tokjson = files[0]
         cls.merges = files[1]
diff --git a/test/test_gpt2tok.py b/test/test_gpt2tok.py
index 2fae5988c..6b1988c68 100644
--- a/test/test_gpt2tok.py
+++ b/test/test_gpt2tok.py
@@ -2,7 +2,6 @@
 import numpy as np
 import onnxruntime as _ort
 
-from pathlib import Path
 from onnx import helper, onnx_pb as onnx_proto
 from transformers import GPT2Tokenizer
 from onnxruntime_extensions import (
@@ -17,11 +16,6 @@ def _get_file_content(path):
         return file.read()
 
 
-def _get_test_data_file(*sub_dirs):
-    test_dir = Path(__file__).parent
-    return str(test_dir.joinpath(*sub_dirs))
-
-
 def _create_test_model(**kwargs):
     vocab_file = kwargs["vocab_file"]
     merges_file = kwargs["merges_file"]
diff --git a/test/test_robertatok.py b/test/test_robertatok.py
index 053a23ac6..4eb2681e1 100644
--- a/test/test_robertatok.py
+++ b/test/test_robertatok.py
@@ -42,7 +42,9 @@ class TestRobertaTokenizer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-        files = cls.tokenizer.save_vocabulary(".")
+        temp_dir = Path('./temp_onnxroberta')
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        files = cls.tokenizer.save_vocabulary(str(temp_dir))
         cls.tokjson = files[0]
         cls.merges = files[1]

From 7b2916987d3cecf1f46780ee22837cd4b8282e3e Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Mon, 27 Feb 2023 22:38:30 -0800
Subject: [PATCH 3/8] Update ci.yml for Azure Pipelines

---
 .pipelines/ci.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 37516ad28..0f2f4cde1 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -489,7 +489,8 @@ jobs:
 
     steps:
     # compiled as only one operator selected.
-    - script: |
-        set -e -x -u
-        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
-        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
+    - bash: |
+        set -e -x -u
+        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
+        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
+      displayName: Build ort-extensions with only one operator selected
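This CI job exercises the combination patch 1 wires up in CMake: OCOS_ENABLE_CPP_EXCEPTIONS=OFF defines OCOS_NO_EXCEPTIONS/ORT_NO_EXCEPTIONS (and _HAS_EXCEPTIONS=0 for the MSVC STL). Source code then typically routes every failure through a macro of this shape; a generic sketch of the pattern with hypothetical names, not code taken from this repo:

    #include <cstdio>
    #include <cstdlib>
    #include <stdexcept>

    // With OCOS_NO_EXCEPTIONS defined the error path prints and aborts;
    // otherwise it throws and the caller can translate the exception.
    #if defined(OCOS_NO_EXCEPTIONS)
    #define THROW_OR_ABORT(msg)       \
      do {                            \
        std::fputs(msg "\n", stderr); \
        std::abort();                 \
      } while (0)
    #else
    #define THROW_OR_ABORT(msg) throw std::runtime_error(msg)
    #endif

    int CheckPositive(int v) {
      if (v <= 0) THROW_OR_ABORT("value must be positive");
      return v;
    }

    int main() {
      std::printf("%d\n", CheckPositive(3));
      return 0;
    }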
From 0bf443036d9a008ffc18ec32c155549a5f6dc562 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Mon, 27 Feb 2023 22:40:31 -0800
Subject: [PATCH 4/8] Update ci.yml for Azure Pipelines

---
 .pipelines/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 0f2f4cde1..5a50296a6 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -483,7 +483,7 @@ jobs:
   ##############################
   # Linux for selected_ops build
   ##############################
-  - job: Linux
+  - job: Linux_SelectedOpsBuild
     pool:
       vmImage: 'ubuntu-latest'

From 6ff1bb20043ba9c9507872c5f0ae96802ea7ace6 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 10:10:02 -0800
Subject: [PATCH 5/8] fix the Linux build

---
 base/ustring.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/ustring.h b/base/ustring.h
index e20a21053..e20f90210 100644
--- a/base/ustring.h
+++ b/base/ustring.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "ocos.h"
+#include
 #include
 
 // ustring needs a new implementation, due to the std::codecvt deprecation.

From 7d3c87696e9d2586ed63622ca0f26b0c0a3ec02f Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 11:05:39 -0800
Subject: [PATCH 6/8] one more fix

---
 operators/tokenizer/bpetokenizer.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 5907c5615..fcc8e1f63 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -1,11 +1,12 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 #pragma once
+#include "ocos.h"
 #include "ustring.h"
 
 #include
+#include
 #include
-#include
 
 #include "unicode.h"
 #include "nlohmann/json.hpp"

From 49fbf414cdc2042d065c02b52e9897046a3effa5 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 11:22:04 -0800
Subject: [PATCH 7/8] again?
---
 operators/tokenizer/wordpiece_tokenizer.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/operators/tokenizer/wordpiece_tokenizer.hpp b/operators/tokenizer/wordpiece_tokenizer.hpp
index 298a5b197..a34b82cdb 100644
--- a/operators/tokenizer/wordpiece_tokenizer.hpp
+++ b/operators/tokenizer/wordpiece_tokenizer.hpp
@@ -8,6 +8,9 @@
 #include "string_utils.h"
 #include "string_tensor.h"
 
+#include
+
+
 struct KernelWordpieceTokenizer : BaseKernel {
   KernelWordpieceTokenizer(const OrtApi& api, const OrtKernelInfo& info);
   void Compute(OrtKernelContext* context);

From 37ee047394420f06e13a5ca5fc2d40d30c217afd Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 14:04:10 -0800
Subject: [PATCH 8/8] fixing for macOS

---
 operators/tokenizer/bert_tokenizer.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/operators/tokenizer/bert_tokenizer.hpp b/operators/tokenizer/bert_tokenizer.hpp
index bd78d2635..4faddd067 100644
--- a/operators/tokenizer/bert_tokenizer.hpp
+++ b/operators/tokenizer/bert_tokenizer.hpp
@@ -9,6 +9,9 @@
 #include "string_tensor.h"
 #include "basic_tokenizer.hpp"
 
+#include
+
+
 class BertTokenizerVocab final {
  public:
  explicit BertTokenizerVocab(std::string_view vocab);
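Patches 5 through 8 are all the same class of one-line fix: a header that compiled on one platform only because a standard header arrived transitively, then broke on Linux or macOS where libstdc++/libc++ compose their internals differently. The durable rule is include-what-you-use; a minimal illustration (sketch_vocab.h is a hypothetical header, not part of this series):

    // sketch_vocab.h -- every std type named here gets its header stated
    // directly, so the file no longer depends on whatever ocos.h happens
    // to drag in on a particular platform/STL.
    #pragma once
    #include <string>
    #include <string_view>
    #include <unordered_map>

    struct SketchVocab {
      explicit SketchVocab(std::string_view vocab);
      std::unordered_map<std::string, int> token_to_id_;
    };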