microsoft · wenbingl · Feb 28, 2023 · Feb 28, 2023 · Feb 28, 2023 · Feb 28, 2023
@@ -38,7 +38,7 @@ out/
 .scb/
 onnxruntime_extensions/_version.py
 onnxruntime-*-*-*/
-temp_*.onnx
+temp_*onnx*
 # Java specific ignores
 */.gradle
 java/hs_*.log

@@ -478,3 +478,19 @@ jobs:
             echo "Exception propogation was not enabled correctly."
             exit 1
           fi
+
+
+  ##############################
+  # Linux for selected_ops build
+  ##############################
+  - job: Linux_SelectedOpsBuild
+    pool:
+      vmImage: 'ubuntu-latest'
+
+    steps:
+      # Build with only one operator (the BERT tokenizer) selected and C++ exceptions disabled.
+      - bash: |
+          set -e -x -u
+          echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
+          ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
+        displayName: Build ort-extensions with only one operator was selected
@@ -208,6 +208,9 @@ endif()
 
 if(NOT OCOS_ENABLE_CPP_EXCEPTIONS)
   add_compile_definitions(OCOS_NO_EXCEPTIONS ORT_NO_EXCEPTIONS)
+  if (NOT _ONNXRUNTIME_EMBEDDED)
+    add_compile_definitions(_HAS_EXCEPTIONS=0)
+  endif()
 endif()
 
 include(FetchContent)
@@ -254,7 +257,7 @@ if(OCOS_ENABLE_RE2_REGEX)
 endif()
 
 # ### scan all source files
-set(TARGET_SRC_NOEXCEPTION)
+file(GLOB TARGET_SRC_NOEXCEPTION "base/*.h" "base/*.cc")
 file(GLOB TARGET_SRC "operators/*.cc" "operators/*.h" "includes/*.h*")
 
 if(OCOS_ENABLE_TF_STRING)
@@ -402,11 +405,13 @@ standardize_output_folder(ocos_operators)
 target_include_directories(noexcep_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators)
 
 target_include_directories(ocos_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators
   ${PROJECT_SOURCE_DIR}/operators/tokenizer)
 

@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "string_tensor.h"
 #include "string_utils.h"
 #include "ustring.h"
-#include "string_tensor.h"
 
 void GetTensorMutableDataString(const OrtApi& api, OrtW::CustomOpApi& ort, OrtKernelContext* context,
                                 const OrtValue* value, std::vector<std::string>& output) {

@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <string>
 #include "ocos.h"
+#include <string>
 
 
 // Retrieves a vector of strings if the input type is std::string.

@@ -1,7 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include <iostream>
 #include "ustring.h"
+#include <iostream>
+
 
 ustring::ustring() : std::u32string() {
 }

@@ -1,15 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #pragma once
-#include <string>
-#include <locale>
-#include <functional>
-
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 1
-#include <codecvt>
 
 #include "ocos.h"
+#include <locale>
+#include <codecvt>
 
+// TODO: ustring needs a new implementation because std::codecvt is deprecated (since C++17).
 // Wrap u32string with ustring, in case we will use other implementation in the future
 class ustring : public std::u32string {
  public:

@@ -3,6 +3,8 @@
 
 #pragma once
 
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#include <string>
 #include <algorithm>
 #include <functional>
 #include <iterator>

@@ -1,12 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "string_utils.h"
 #include "basic_tokenizer.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
 #include <vector>
 #include <locale>
-#include <codecvt>
 #include <algorithm>
 
 BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents,

@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include <unordered_map>
-#include <vector>
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"

@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include <unordered_map>
-#include <vector>
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"

@@ -1,28 +1,58 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 #pragma once
-#include <string>
-#include <vector>
-#include <fstream>
-#include <sstream>
-#include <iostream>
-#include <algorithm>
-#include <list>
-#include <memory>
+#include "ocos.h"
+#include "ustring.h"
+
 #include <regex>
-#include <sstream>
-#include <stdexcept>
+#include <list>
 #include <unordered_map>
-#include <functional>
-#include <codecvt>
-#include <mutex>
 
+#include "unicode.h"
 #include "nlohmann/json.hpp"
-#include "clip_tokenizer.hpp"
-#include "gpt2_tokenizer.hpp"
-#include "roberta_tokenizer.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
-#include "unicode.h"
+
+// Returns true when `ch` is one of the whitespace code points listed below, false otherwise. Note: the list comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
+inline bool IsUnicodeSpace(char32_t ch) {
+  switch (ch) {
+    case 0x0009:  // U+0009..U+000D: tab, line feed, vertical tab, form feed, carriage return
+    case 0x000A:
+    case 0x000B:
+    case 0x000C:
+    case 0x000D:
+    case 0x001C:  // U+001C..U+001F: file, group, record and unit separators
+    case 0x001D:
+    case 0x001E:
+    case 0x001F:
+    case 0x0020:  // space
+    case 0x0085:  // next line (NEL)
+    case 0x00A0:  // no-break space
+    case 0x1680:  // Ogham space mark
+    case 0x2000:  // U+2000..U+200A: en quad through hair space
+    case 0x2001:
+    case 0x2002:
+    case 0x2003:
+    case 0x2004:
+    case 0x2005:
+    case 0x2006:
+    case 0x2007:
+    case 0x2008:
+    case 0x2009:
+    case 0x200A:
+    case 0x2028:  // line separator
+    case 0x2029:  // paragraph separator
+    case 0x202F:  // narrow no-break space
+    case 0x205F:  // medium mathematical space
+    case 0x3000:  // ideographic space
+      return true;  // every case above falls through to here
+  }
+  return false;  // any code point not listed above
+}
+
+inline bool IsEmptyUString(const ustring& str) {  // true when every code point of `str` satisfies IsUnicodeSpace
+  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });  // std::all_of is true for an empty range, so a zero-length `str` also yields true
+}
 
 class SpecialTokenMap {
  public:
@@ -117,7 +147,6 @@ class VocabData {
     } else {
       int id = static_cast<int>(vocab_map_.size());
       vocab_map_[unk_token] = id;
-      std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;
     }
 
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;

@@ -1,68 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
-
-#include <string>
-#include <vector>
-#include <fstream>
-#include <sstream>
-#include <iostream>
-#include <algorithm>
-#include <list>
-#include <memory>
-#include <regex>
-#include <sstream>
-#include <stdexcept>
-#include <unordered_map>
-#include <functional>
-#include <codecvt>
-#include <mutex>
-
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsInUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
-
-bool IsEmptyUstring(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsInUnicodeSpace(ch); });
-}
+#include "clip_tokenizer.hpp"
+#include "string_utils.h"
 
 KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -93,7 +33,7 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
 std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length) {
   std::vector<int64_t> res;
 
-  if (IsEmptyUstring(input)) {
+  if (IsEmptyUString(input)) {
     return res;
   }
   // Add <|startoftext|> token to result

@@ -1,9 +1,8 @@
-#include <list>
-#include "ocos.h"
-#include "ustring.h"
-#include "string_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
-class VocabData;
+#pragma once
+#include "bpetokenizer.hpp"
 
 struct KernelClipBpeTokenizer : BaseKernel {
   KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);