From f2dcd19f5d703b8ab034e9e36f6228af01e81763 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Mon, 27 Feb 2023 18:06:05 -0800
Subject: [PATCH 1/8] fix the break in the release pipeline

---
 CMakeLists.txt | 7 +++-
 {operators => base}/base64.cc | 0
 {operators => base}/base64.h | 0
 {operators => base}/narrow.h | 0
 {operators => base}/ocos.cc | 0
 {operators => base}/string_tensor.cc | 0
 {operators => base}/string_tensor.h | 0
 {operators => base}/string_utils.cc | 0
 {operators => base}/string_utils.h | 0
 {operators => base}/ustring.cc | 0
 {operators => base}/ustring.h | 1 +
 operators/tokenizer/bpetokenizer.hpp | 46 ++++++++++++++++++++--
 operators/tokenizer/clip_tokenizer.cc | 48 +----------------------
 operators/tokenizer/clip_tokenizer.hpp | 2 +-
 operators/tokenizer/gpt2_tokenizer.cc | 45 +--------------------
 operators/tokenizer/gpt2_tokenizer.hpp | 3 +-
 operators/tokenizer/roberta_tokenizer.cc | 48 +----------------------
 operators/tokenizer/roberta_tokenizer.hpp | 3 +-
 18 files changed, 57 insertions(+), 146 deletions(-)
 rename {operators => base}/base64.cc (100%)
 rename {operators => base}/base64.h (100%)
 rename {operators => base}/narrow.h (100%)
 rename {operators => base}/ocos.cc (100%)
 rename {operators => base}/string_tensor.cc (100%)
 rename {operators => base}/string_tensor.h (100%)
 rename {operators => base}/string_utils.cc (100%)
 rename {operators => base}/string_utils.h (100%)
 rename {operators => base}/ustring.cc (100%)
 rename {operators => base}/ustring.h (96%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 38be4d988..db5d5846f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,9 @@ endif()
 
 if(NOT OCOS_ENABLE_CPP_EXCEPTIONS)
   add_compile_definitions(OCOS_NO_EXCEPTIONS ORT_NO_EXCEPTIONS)
+  if (NOT _ONNXRUNTIME_EMBEDDED)
+    add_compile_definitions(_HAS_EXCEPTIONS=0)
+  endif()
 endif()
 
 include(FetchContent)
@@ -254,7 +257,7 @@ if(OCOS_ENABLE_RE2_REGEX)
 endif()
 
 # ### scan all source files
-set(TARGET_SRC_NOEXCEPTION)
+file(GLOB TARGET_SRC_NOEXCEPTION "base/*.h" "base/*.cc")
 file(GLOB TARGET_SRC "operators/*.cc" "operators/*.h" "includes/*.h*")
 
 if(OCOS_ENABLE_TF_STRING)
@@ -402,11 +405,13 @@ standardize_output_folder(ocos_operators)
 target_include_directories(noexcep_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators)
 
 target_include_directories(ocos_operators PUBLIC
   ${ONNXRUNTIME_INCLUDE_DIR}
   ${PROJECT_SOURCE_DIR}/includes
+  ${PROJECT_SOURCE_DIR}/base
   ${PROJECT_SOURCE_DIR}/operators
   ${PROJECT_SOURCE_DIR}/operators/tokenizer)
diff --git a/operators/base64.cc b/base/base64.cc
similarity index 100%
rename from operators/base64.cc
rename to base/base64.cc
diff --git a/operators/base64.h b/base/base64.h
similarity index 100%
rename from operators/base64.h
rename to base/base64.h
diff --git a/operators/narrow.h b/base/narrow.h
similarity index 100%
rename from operators/narrow.h
rename to base/narrow.h
diff --git a/operators/ocos.cc b/base/ocos.cc
similarity index 100%
rename from operators/ocos.cc
rename to base/ocos.cc
diff --git a/operators/string_tensor.cc b/base/string_tensor.cc
similarity index 100%
rename from operators/string_tensor.cc
rename to base/string_tensor.cc
diff --git a/operators/string_tensor.h b/base/string_tensor.h
similarity index 100%
rename from operators/string_tensor.h
rename to base/string_tensor.h
diff --git a/operators/string_utils.cc b/base/string_utils.cc
similarity index 100%
rename from operators/string_utils.cc
rename to base/string_utils.cc
diff --git a/operators/string_utils.h b/base/string_utils.h
similarity index 100%
rename from operators/string_utils.h
rename to base/string_utils.h
diff --git a/operators/ustring.cc b/base/ustring.cc
similarity index 100%
rename from operators/ustring.cc
rename to base/ustring.cc
diff --git a/operators/ustring.h b/base/ustring.h
similarity index 96%
rename from operators/ustring.h
rename to base/ustring.h
index 9e960ee70..cf817df56 100644
--- a/operators/ustring.h
+++ b/base/ustring.h
@@ -10,6 +10,7 @@
 
 #include "ocos.h"
 
+// ustring needs a new implementation, due to the std::codecvt deprecation.
 // Wrap u32string with ustring, in case we will use other implementation in the future
 class ustring : public std::u32string {
  public:
diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 269ad8119..0b7d093b7 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -17,12 +17,50 @@
 #include
 #include
 
+#include "unicode.h"
 #include "nlohmann/json.hpp"
-#include "clip_tokenizer.hpp"
-#include "gpt2_tokenizer.hpp"
-#include "roberta_tokenizer.hpp"
 #include "string_tensor.h"
-#include "unicode.h"
+
+// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
+inline bool IsUnicodeSpace(char32_t ch) {
+  switch (ch) {
+    case 0x0009:
+    case 0x000A:
+    case 0x000B:
+    case 0x000C:
+    case 0x000D:
+    case 0x001C:
+    case 0x001D:
+    case 0x001E:
+    case 0x001F:
+    case 0x0020:
+    case 0x0085:
+    case 0x00A0:
+    case 0x1680:
+    case 0x2000:
+    case 0x2001:
+    case 0x2002:
+    case 0x2003:
+    case 0x2004:
+    case 0x2005:
+    case 0x2006:
+    case 0x2007:
+    case 0x2008:
+    case 0x2009:
+    case 0x200A:
+    case 0x2028:
+    case 0x2029:
+    case 0x202F:
+    case 0x205F:
+    case 0x3000:
+      return true;
+  }
+  return false;
+}
+
+inline bool IsEmptyUString(const ustring& str) {
+  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
+}
 
 class SpecialTokenMap {
  public:
diff --git a/operators/tokenizer/clip_tokenizer.cc b/operators/tokenizer/clip_tokenizer.cc
index e123a62f2..434daf901 100644
--- a/operators/tokenizer/clip_tokenizer.cc
+++ b/operators/tokenizer/clip_tokenizer.cc
@@ -18,51 +18,7 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsInUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
-
-bool IsEmptyUstring(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsInUnicodeSpace(ch); });
-}
+#include "clip_tokenizer.hpp"
 
 KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -93,7 +49,7 @@ KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKerne
 std::vector<int64_t> KernelClipBpeTokenizer::Tokenize(ustring& input, int64_t max_length) {
   std::vector<int64_t> res;
 
-  if (IsEmptyUstring(input)) {
+  if (IsEmptyUString(input)) {
     return res;
   }
   // Add <|startoftext|> token to result
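For reference, the IsEmptyUString helper this patch consolidates into bpetokenizer.hpp treats a string as "empty" when every code point is CPython-style whitespace. A minimal standalone sketch (ustring is stood in by a plain std::u32string alias, and the case list is abbreviated to a few representative code points; the real table is the full list above):

    #include <algorithm>
    #include <cassert>
    #include <string>

    using ustring = std::u32string;  // stand-in for the project's ustring wrapper

    inline bool IsUnicodeSpace(char32_t ch) {
      switch (ch) {  // abbreviated: the consolidated helper lists all 29 code points
        case 0x0009:
        case 0x000A:
        case 0x0020:
        case 0x3000:
          return true;
      }
      return false;
    }

    inline bool IsEmptyUString(const ustring& str) {
      return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
    }

    int main() {
      assert(IsEmptyUString(U"\t \u3000"));  // whitespace-only input short-circuits Tokenize()
      assert(!IsEmptyUString(U" token "));   // any non-space code point counts as content
      return 0;
    }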
diff --git a/operators/tokenizer/clip_tokenizer.hpp b/operators/tokenizer/clip_tokenizer.hpp
index 8489e83a6..8cf4b0c58 100644
--- a/operators/tokenizer/clip_tokenizer.hpp
+++ b/operators/tokenizer/clip_tokenizer.hpp
@@ -3,7 +3,7 @@
 #include "ustring.h"
 #include "string_utils.h"
 
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelClipBpeTokenizer : BaseKernel {
   KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);
diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc
index 5aba4a477..2f0501b76 100644
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@@ -17,51 +17,8 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
+#include "gpt2_tokenizer.hpp"
 
-bool IsEmptyUString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });
-}
 
 KernelBpeTokenizer::KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
diff --git a/operators/tokenizer/gpt2_tokenizer.hpp b/operators/tokenizer/gpt2_tokenizer.hpp
index ed1625d98..1eeb40246 100644
--- a/operators/tokenizer/gpt2_tokenizer.hpp
+++ b/operators/tokenizer/gpt2_tokenizer.hpp
@@ -1,8 +1,7 @@
 #include
 #include "ocos.h"
 #include "ustring.h"
-
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelBpeTokenizer : BaseKernel {
   KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);
diff --git a/operators/tokenizer/roberta_tokenizer.cc b/operators/tokenizer/roberta_tokenizer.cc
index 4737c054e..eb1bc8f81 100644
--- a/operators/tokenizer/roberta_tokenizer.cc
+++ b/operators/tokenizer/roberta_tokenizer.cc
@@ -18,51 +18,7 @@
 #include
 #include
 
-#include "nlohmann/json.hpp"
-#include "bpetokenizer.hpp"
-#include "string_tensor.h"
-#include "unicode.h"
-
-// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
-bool IsWithinUnicodeSpace(char32_t ch) {
-  switch (ch) {
-    case 0x0009:
-    case 0x000A:
-    case 0x000B:
-    case 0x000C:
-    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
-    case 0x001F:
-    case 0x0020:
-    case 0x0085:
-    case 0x00A0:
-    case 0x1680:
-    case 0x2000:
-    case 0x2001:
-    case 0x2002:
-    case 0x2003:
-    case 0x2004:
-    case 0x2005:
-    case 0x2006:
-    case 0x2007:
-    case 0x2008:
-    case 0x2009:
-    case 0x200A:
-    case 0x2028:
-    case 0x2029:
-    case 0x202F:
-    case 0x205F:
-    case 0x3000:
-      return true;
-  }
-  return false;
-}
-
-bool IsEmptyuString(const ustring& str) {
-  return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsWithinUnicodeSpace(ch); });
-}
+#include "roberta_tokenizer.hpp"
 
 KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -93,7 +49,7 @@ KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const Or
 std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map) {
   std::vector<int64_t> res;
 
-  if (IsEmptyuString(input)) {
+  if (IsEmptyUString(input)) {
     return res;
   }
   // Add BOS token to result
diff --git a/operators/tokenizer/roberta_tokenizer.hpp b/operators/tokenizer/roberta_tokenizer.hpp
index e0252a304..6ab4fecbe 100644
--- a/operators/tokenizer/roberta_tokenizer.hpp
+++ b/operators/tokenizer/roberta_tokenizer.hpp
@@ -2,8 +2,7 @@
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
-
-class VocabData;
+#include "bpetokenizer.hpp"
 
 struct KernelRobertaBpeTokenizer : BaseKernel {
   KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info);

From 371b6e6735d83666c2b2567de825f448f46c9e6c Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Mon, 27 Feb 2023 21:13:53 -0800
Subject: [PATCH 2/8] code cleanup and warning fixes

---
 .gitignore | 2 +-
 .pipelines/ci.yml | 15 ++++++++
 base/string_tensor.cc | 2 +-
 base/string_tensor.h | 2 +-
 base/ustring.cc | 3 +-
 base/ustring.h | 7 +---
 includes/ocos.h | 2 +
 operators/tokenizer/basic_tokenizer.cc | 3 +-
 operators/tokenizer/bert_tokenizer.hpp | 2 -
 operators/tokenizer/bert_tokenizer_decoder.hpp | 2 -
 operators/tokenizer/bpetokenizer.hpp | 16 ++------
 operators/tokenizer/clip_tokenizer.cc | 18 +--------
 operators/tokenizer/clip_tokenizer.hpp | 7 ++--
 operators/tokenizer/gpt2_tokenizer.cc | 29 --------------
 operators/tokenizer/gpt2_tokenizer.hpp | 7 ++--
 operators/tokenizer/roberta_tokenizer.cc | 38 ++++++-------------
 operators/tokenizer/roberta_tokenizer.hpp | 11 +++---
 operators/tokenizer/wordpiece_tokenizer.hpp | 2 -
 test/test_cliptok.py | 4 +-
 test/test_gpt2tok.py | 6 ---
 test/test_robertatok.py | 4 +-
 21 files changed, 59 insertions(+), 123 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2377dfe4f..0905bbd98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,7 +38,7 @@ out/
 .scb/
 onnxruntime_extensions/_version.py
 onnxruntime-*-*-*/
-temp_*.onnx
+temp_*onnx*
 # Java specific ignores
 */.gradle
 java/hs_*.log
diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index a14653aa3..37516ad28 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -478,3 +478,18 @@ jobs:
         echo "Exception propogation was not enabled correctly."
         exit 1
       fi
+
+
+  ##############################
+  # Linux for selected_ops build
+  ##############################
+  - job: Linux
+    pool:
+      vmImage: 'ubuntu-latest'
+
+    steps:
+    # compiled as only one operator selected.
+    - script: |
+        set -e -x -u
+        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
+        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
diff --git a/base/string_tensor.cc b/base/string_tensor.cc
index 9e832c1c5..569a39e7b 100644
--- a/base/string_tensor.cc
+++ b/base/string_tensor.cc
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "string_tensor.h"
 #include "string_utils.h"
 #include "ustring.h"
-#include "string_tensor.h"
 
 void GetTensorMutableDataString(const OrtApi& api, OrtW::CustomOpApi& ort, OrtKernelContext* context,
                                 const OrtValue* value, std::vector<std::string>& output) {
diff --git a/base/string_tensor.h b/base/string_tensor.h
index 6b99fe1c0..469db8da8 100644
--- a/base/string_tensor.h
+++ b/base/string_tensor.h
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include
 #include "ocos.h"
+#include
 
 // Retrieves a vector of strings if the input type is std::string.
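The string_tensor.h helpers above are the entry point every string kernel in this series uses. A sketch of the call pattern, mirroring KernelRobertaBpeTokenizer::Compute later in this patch; KernelUpperCase is a hypothetical kernel written for illustration, and it assumes the repo's ocos.h/string_tensor.h plus the FillTensorDataString counterpart declared alongside the getter:

    #include "ocos.h"
    #include "string_tensor.h"
    #include <algorithm>
    #include <cctype>

    struct KernelUpperCase : BaseKernel {
      KernelUpperCase(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) {}

      void Compute(OrtKernelContext* context) {
        const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
        std::vector<std::string> str_input;
        // Copy every element of the input string tensor into a std::vector.
        GetTensorMutableDataString(api_, ort_, context, input, str_input);

        for (auto& s : str_input) {
          std::transform(s.begin(), s.end(), s.begin(),
                         [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
        }

        // Allocate an output of the same shape and write the strings back.
        OrtTensorDimensions dims(ort_, input);
        OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dims.data(), dims.size());
        FillTensorDataString(api_, ort_, context, str_input, output);
      }
    };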
diff --git a/base/ustring.cc b/base/ustring.cc
index 1a06392bf..9ac9a8eb0 100644
--- a/base/ustring.cc
+++ b/base/ustring.cc
@@ -1,7 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include
 #include "ustring.h"
+#include
+
 
 ustring::ustring() : std::u32string() {
 }
diff --git a/base/ustring.h b/base/ustring.h
index cf817df56..e20a21053 100644
--- a/base/ustring.h
+++ b/base/ustring.h
@@ -1,14 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #pragma once
-#include
-#include
-#include
-
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING 1
-#include
 
 #include "ocos.h"
+#include
 
 // ustring needs a new implementation, due to the std::codecvt deprecation.
 // Wrap u32string with ustring, in case we will use other implementation in the future
diff --git a/includes/ocos.h b/includes/ocos.h
index b61d1c585..32be5a69a 100644
--- a/includes/ocos.h
+++ b/includes/ocos.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#include <codecvt>
 #include
 #include
 #include
diff --git a/operators/tokenizer/basic_tokenizer.cc b/operators/tokenizer/basic_tokenizer.cc
index d57ae1d4b..ce87c67e7 100644
--- a/operators/tokenizer/basic_tokenizer.cc
+++ b/operators/tokenizer/basic_tokenizer.cc
@@ -1,12 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "string_utils.h"
 #include "basic_tokenizer.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
 #include
 #include
-#include
 #include
 
 BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents,
diff --git a/operators/tokenizer/bert_tokenizer.hpp b/operators/tokenizer/bert_tokenizer.hpp
index 7b3f320b7..bd78d2635 100644
--- a/operators/tokenizer/bert_tokenizer.hpp
+++ b/operators/tokenizer/bert_tokenizer.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/operators/tokenizer/bert_tokenizer_decoder.hpp b/operators/tokenizer/bert_tokenizer_decoder.hpp
index a0863d434..c1e20b961 100644
--- a/operators/tokenizer/bert_tokenizer_decoder.hpp
+++ b/operators/tokenizer/bert_tokenizer_decoder.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 0b7d093b7..5907c5615 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -1,24 +1,15 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 #pragma once
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include "ustring.h"
+
 #include
-#include
-#include
 #include
 #include
-#include
-#include
 
 #include "unicode.h"
 #include "nlohmann/json.hpp"
+#include "string_utils.h"
 #include "string_tensor.h"
 
 // Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
@@ -155,7 +146,6 @@ class VocabData {
     } else {
       int id = static_cast<int>(vocab_map_.size());
       vocab_map_[unk_token] = id;
-      std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;
     }
 
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;
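The reshuffling above is driven by one constraint: _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING must be defined before <codecvt> is first seen, which is why it moves into ocos.h, the header everything else pulls in first. A self-contained sketch of the deprecated conversion that ustring and VocabData still rely on (hence the "needs a new implementation" comment):

    // MSVC's STL rejects <codecvt> under /std:c++17 unless this macro is
    // visible first; on gcc/clang it is inert and you merely get a
    // deprecation warning.
    #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
    #include <codecvt>
    #include <locale>
    #include <string>
    #include <cassert>

    int main() {
      std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
      std::u32string u32 = conv.from_bytes("caf\xc3\xa9");  // UTF-8 -> UTF-32
      assert(u32.size() == 4);                 // 'e-acute' is one code point
      std::string bytes = conv.to_bytes(u32);  // UTF-32 -> UTF-8
      assert(bytes.size() == 5);               // ...but two UTF-8 bytes
      return 0;
    }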
diff --git a/operators/tokenizer/clip_tokenizer.cc b/operators/tokenizer/clip_tokenizer.cc
index 434daf901..1f565c820 100644
--- a/operators/tokenizer/clip_tokenizer.cc
+++ b/operators/tokenizer/clip_tokenizer.cc
@@ -1,24 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "clip_tokenizer.hpp"
+#include "string_utils.h"
 
 KernelClipBpeTokenizer::KernelClipBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
diff --git a/operators/tokenizer/clip_tokenizer.hpp b/operators/tokenizer/clip_tokenizer.hpp
index 8cf4b0c58..d5387919c 100644
--- a/operators/tokenizer/clip_tokenizer.hpp
+++ b/operators/tokenizer/clip_tokenizer.hpp
@@ -1,8 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
-#include "string_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelClipBpeTokenizer : BaseKernel {
diff --git a/operators/tokenizer/gpt2_tokenizer.cc b/operators/tokenizer/gpt2_tokenizer.cc
index 2f0501b76..2a3234c57 100644
--- a/operators/tokenizer/gpt2_tokenizer.cc
+++ b/operators/tokenizer/gpt2_tokenizer.cc
@@ -2,21 +2,6 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "gpt2_tokenizer.hpp"
 
 
@@ -157,17 +142,3 @@ size_t CustomOpBpeTokenizer::GetOutputTypeCount() const {
 ONNXTensorElementDataType CustomOpBpeTokenizer::GetOutputType(size_t /*index*/) const {
   return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
 }
-
-const OrtCustomOp** LoadTokenizerSchemaList() {
-  // create the global objects here to let the ORT catch the expection if any
-  static std::unique_ptr<CustomOpBpeTokenizer> p_CoBpeTokenizer;
-  static const OrtCustomOp* c_CustomOpList[2] = {nullptr};  // {&c_CoBpeTokenizer, nullptr};
-  static std::mutex mtx_loaded;
-  std::lock_guard<std::mutex> lck(mtx_loaded);
-  if (p_CoBpeTokenizer.get() == nullptr) {
-    p_CoBpeTokenizer = std::make_unique<CustomOpBpeTokenizer>();
-    c_CustomOpList[0] = p_CoBpeTokenizer.get();
-  }
-
-  return c_CustomOpList;
-}
diff --git a/operators/tokenizer/gpt2_tokenizer.hpp b/operators/tokenizer/gpt2_tokenizer.hpp
index 1eeb40246..31b2bd2d2 100644
--- a/operators/tokenizer/gpt2_tokenizer.hpp
+++ b/operators/tokenizer/gpt2_tokenizer.hpp
@@ -1,6 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelBpeTokenizer : BaseKernel {
diff --git a/operators/tokenizer/roberta_tokenizer.cc b/operators/tokenizer/roberta_tokenizer.cc
index eb1bc8f81..c886ce7e7 100644
--- a/operators/tokenizer/roberta_tokenizer.cc
+++ b/operators/tokenizer/roberta_tokenizer.cc
@@ -2,23 +2,9 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include "roberta_tokenizer.hpp"
+#include "narrow.h"
+
 
 KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const OrtKernelInfo& info)
     : BaseKernel(api, info) {
@@ -46,7 +32,7 @@ KernelRobertaBpeTokenizer::KernelRobertaBpeTokenizer(const OrtApi& api, const Or
   bbpe_tokenizer_->Load(vocabu_stream, merges_stream, "<|endoftext|>", "<|endoftext|>");
 }
 
-std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map) {
+std::vector<int64_t> KernelRobertaBpeTokenizer::Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map) {
   std::vector<int64_t> res;
 
   if (IsEmptyUString(input)) {
@@ -72,8 +58,8 @@
   const char32_t* ptr = cur_input.c_str();
   regcmp.Set(ptr);
 
-  int offset = 0;
-  std::list<std::pair<int64_t, int64_t>> offset_mapping;
+  size_t offset = 0;
+  OffsetMappingType offset_mapping;
 
   // Add offset mapping for BOS token
   offset_mapping.push_back(std::make_pair(0, 0));
@@ -86,16 +72,16 @@
 
       // Handle offset mapping and special cases
       if (utf8_token.at(0) == ' ') {
-        offset_mapping.push_back(std::make_pair(offset + 1, offset + utf8_token.size()));
+        offset_mapping.emplace_back(std::make_pair(offset + 1, ort_extensions::narrow<size_t>(offset + utf8_token.size())));
       } else {
-        offset_mapping.push_back(std::make_pair(offset, offset + utf8_token.size()));
+        offset_mapping.emplace_back(std::make_pair(offset, ort_extensions::narrow<size_t>(offset + utf8_token.size())));
       }
       offset += utf8_token.size();
 
       // Get byte encodings prior to performing BPE
       byte_list_.clear();
       for (char& cp : utf8_token) {
-        byte_list_.push_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);
+        byte_list_.emplace_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);
       }
 
       // Perform BPE
@@ -111,13 +97,13 @@
       }
     }
     // Add offset mapping for EOS token
-    offset_mapping.push_back(std::make_pair(0, 0));
+    offset_mapping.emplace_back(std::make_pair(0, 0));
     // Add offset mappings for input in this instance to list of offset mappings for all inputs
-    offset_map.push_back(offset_mapping);
+    offset_map.emplace_back(offset_mapping);
   }
 
   // Add EOS token to result
-  res.push_back(bbpe_tokenizer_->GetEncoding("</s>"));
+  res.emplace_back(bbpe_tokenizer_->GetEncoding("</s>"));
 
   return res;
 }
@@ -125,7 +111,7 @@ void KernelRobertaBpeTokenizer::Compute(OrtKernelContext* context) {
   // Setup inputs
   const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
   std::vector<std::string> str_input;
-  std::list<std::list<std::pair<int64_t, int64_t>>> offset_map;
+  std::list<OffsetMappingType> offset_map;
   GetTensorMutableDataString(api_, ort_, context, input, str_input);
   OrtTensorDimensions input_dim(ort_, input);
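base/narrow.h itself is not shown in this series; judging by the call sites above, it follows the gsl::narrow pattern: a static_cast that refuses to lose information, aborting rather than throwing so it stays usable in OCOS_NO_EXCEPTIONS builds. A sketch of that pattern (an illustration only, not the repo's actual implementation):

    #include <cstdio>
    #include <cstdlib>
    #include <type_traits>

    template <typename T, typename U>
    T narrow(U u) {
      T t = static_cast<T>(u);
      // Round-trip check: if the value changed, the conversion was lossy.
      if (static_cast<U>(t) != u) {
        std::fputs("narrowing error\n", stderr);
        std::abort();
      }
      // Mixed-signedness check: a sign flip also means the value was mangled.
      if (std::is_signed<T>::value != std::is_signed<U>::value && ((t < T{}) != (u < U{}))) {
        std::fputs("narrowing error\n", stderr);
        std::abort();
      }
      return t;
    }

    int main() {
      size_t token_end = 1048576;
      int ok = narrow<int>(token_end);  // fits: returns 1048576
      (void)ok;
      // narrow<short>(token_end);      // would abort: 1048576 exceeds SHRT_MAX
      return 0;
    }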
diff --git a/operators/tokenizer/roberta_tokenizer.hpp b/operators/tokenizer/roberta_tokenizer.hpp
index 6ab4fecbe..b499b6866 100644
--- a/operators/tokenizer/roberta_tokenizer.hpp
+++ b/operators/tokenizer/roberta_tokenizer.hpp
@@ -1,7 +1,7 @@
-#include
-#include "ocos.h"
-#include "ustring.h"
-#include "string_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
 
+#pragma once
 #include "bpetokenizer.hpp"
 
 struct KernelRobertaBpeTokenizer : BaseKernel {
@@ -9,7 +9,8 @@ struct KernelRobertaBpeTokenizer : BaseKernel {
   void Compute(OrtKernelContext* context);
 
  private:
-  std::vector<int64_t> Tokenize(ustring& input, int64_t max_length, std::list<std::list<std::pair<int64_t, int64_t>>>& offset_map);
+  using OffsetMappingType = std::list<std::pair<size_t, size_t>>;
+  std::vector<int64_t> Tokenize(ustring& input, int64_t max_length, std::list<OffsetMappingType>& offset_map);
 
   int64_t padding_length_;
   std::list byte_list_;
diff --git a/operators/tokenizer/wordpiece_tokenizer.hpp b/operators/tokenizer/wordpiece_tokenizer.hpp
index 1bdb8d893..298a5b197 100644
--- a/operators/tokenizer/wordpiece_tokenizer.hpp
+++ b/operators/tokenizer/wordpiece_tokenizer.hpp
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include
-#include
 #include "ocos.h"
 #include "ustring.h"
 #include "string_utils.h"
diff --git a/test/test_cliptok.py b/test/test_cliptok.py
index ce8fc49c2..028eecd56 100644
--- a/test/test_cliptok.py
+++ b/test/test_cliptok.py
@@ -39,7 +39,9 @@ class TestCLIPTokenizer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
-        files = cls.tokenizer.save_vocabulary(".")
+        temp_dir = Path('./temp_onnxclip')
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        files = cls.tokenizer.save_vocabulary(str(temp_dir))
         cls.tokjson = files[0]
         cls.merges = files[1]
diff --git a/test/test_gpt2tok.py b/test/test_gpt2tok.py
index 2fae5988c..6b1988c68 100644
--- a/test/test_gpt2tok.py
+++ b/test/test_gpt2tok.py
@@ -2,7 +2,6 @@
 import numpy as np
 import onnxruntime as _ort
 
-from pathlib import Path
 from onnx import helper, onnx_pb as onnx_proto
 from transformers import GPT2Tokenizer
 from onnxruntime_extensions import (
@@ -17,11 +16,6 @@ def _get_file_content(path):
         return file.read()
 
 
-def _get_test_data_file(*sub_dirs):
-    test_dir = Path(__file__).parent
-    return str(test_dir.joinpath(*sub_dirs))
-
-
 def _create_test_model(**kwargs):
     vocab_file = kwargs["vocab_file"]
     merges_file = kwargs["merges_file"]
diff --git a/test/test_robertatok.py b/test/test_robertatok.py
index 053a23ac6..4eb2681e1 100644
--- a/test/test_robertatok.py
+++ b/test/test_robertatok.py
@@ -42,7 +42,9 @@ class TestRobertaTokenizer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-        files = cls.tokenizer.save_vocabulary(".")
+        temp_dir = Path('./temp_onnxroberta')
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        files = cls.tokenizer.save_vocabulary(str(temp_dir))
         cls.tokjson = files[0]
         cls.merges = files[1]

From 7b2916987d3cecf1f46780ee22837cd4b8282e3e Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Mon, 27 Feb 2023 22:38:30 -0800
Subject: [PATCH 3/8] Update ci.yml for Azure Pipelines

---
 .pipelines/ci.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 37516ad28..0f2f4cde1 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -489,7 +489,8 @@ jobs:
 
     steps:
     # compiled as only one operator selected.
-    - script: |
-        set -e -x -u
-        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
-        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
+    - bash: |
+        set -e -x -u
+        echo 'set (OCOS_ENABLE_BERT_TOKENIZER ON CACHE BOOL "" FORCE)' > cmake/_selectedoplist.cmake
+        ./build.sh -DOCOS_ENABLE_CPP_EXCEPTIONS=OFF -DOCOS_ENABLE_SELECTED_OPLIST=ON
+      displayName: Build ort-extensions with only one operator selected
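This CI job exercises the combination patch 1 wires up in CMake: OCOS_ENABLE_CPP_EXCEPTIONS=OFF defines OCOS_NO_EXCEPTIONS/ORT_NO_EXCEPTIONS (and _HAS_EXCEPTIONS=0 for the MSVC STL). Source code then typically routes every failure through a macro of this shape; a generic sketch of the pattern with hypothetical names, not code taken from this repo:

    #include <cstdio>
    #include <cstdlib>
    #include <stdexcept>

    // With OCOS_NO_EXCEPTIONS defined the error path prints and aborts;
    // otherwise it throws and the caller can translate the exception.
    #if defined(OCOS_NO_EXCEPTIONS)
    #define THROW_OR_ABORT(msg)       \
      do {                            \
        std::fputs(msg "\n", stderr); \
        std::abort();                 \
      } while (0)
    #else
    #define THROW_OR_ABORT(msg) throw std::runtime_error(msg)
    #endif

    int CheckPositive(int v) {
      if (v <= 0) THROW_OR_ABORT("value must be positive");
      return v;
    }

    int main() {
      std::printf("%d\n", CheckPositive(3));
      return 0;
    }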
From 0bf443036d9a008ffc18ec32c155549a5f6dc562 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Mon, 27 Feb 2023 22:40:31 -0800
Subject: [PATCH 4/8] Update ci.yml for Azure Pipelines

---
 .pipelines/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pipelines/ci.yml b/.pipelines/ci.yml
index 0f2f4cde1..5a50296a6 100644
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@@ -483,7 +483,7 @@ jobs:
   ##############################
   # Linux for selected_ops build
   ##############################
-  - job: Linux
+  - job: Linux_SelectedOpsBuild
     pool:
       vmImage: 'ubuntu-latest'

From 6ff1bb20043ba9c9507872c5f0ae96802ea7ace6 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 10:10:02 -0800
Subject: [PATCH 5/8] fix the Linux build

---
 base/ustring.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/ustring.h b/base/ustring.h
index e20a21053..e20f90210 100644
--- a/base/ustring.h
+++ b/base/ustring.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "ocos.h"
+#include
 #include
 
 // ustring needs a new implementation, due to the std::codecvt deprecation.

From 7d3c87696e9d2586ed63622ca0f26b0c0a3ec02f Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 11:05:39 -0800
Subject: [PATCH 6/8] one more fix

---
 operators/tokenizer/bpetokenizer.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/operators/tokenizer/bpetokenizer.hpp b/operators/tokenizer/bpetokenizer.hpp
index 5907c5615..fcc8e1f63 100644
--- a/operators/tokenizer/bpetokenizer.hpp
+++ b/operators/tokenizer/bpetokenizer.hpp
@@ -1,11 +1,12 @@
 // Licensed under the MIT License.
 // Partial code comes from other Microsoft employee.
 #pragma once
+#include "ocos.h"
 #include "ustring.h"
 
 #include
+#include
 #include
-#include
 
 #include "unicode.h"
 #include "nlohmann/json.hpp"

From 49fbf414cdc2042d065c02b52e9897046a3effa5 Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 11:22:04 -0800
Subject: [PATCH 7/8] again?
---
 operators/tokenizer/wordpiece_tokenizer.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/operators/tokenizer/wordpiece_tokenizer.hpp b/operators/tokenizer/wordpiece_tokenizer.hpp
index 298a5b197..a34b82cdb 100644
--- a/operators/tokenizer/wordpiece_tokenizer.hpp
+++ b/operators/tokenizer/wordpiece_tokenizer.hpp
@@ -8,6 +8,9 @@
 #include "string_utils.h"
 #include "string_tensor.h"
 
+#include
+
+
 struct KernelWordpieceTokenizer : BaseKernel {
   KernelWordpieceTokenizer(const OrtApi& api, const OrtKernelInfo& info);
   void Compute(OrtKernelContext* context);

From 37ee047394420f06e13a5ca5fc2d40d30c217afd Mon Sep 17 00:00:00 2001
From: Wenbing Li
Date: Tue, 28 Feb 2023 14:04:10 -0800
Subject: [PATCH 8/8] fixing for macOS

---
 operators/tokenizer/bert_tokenizer.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/operators/tokenizer/bert_tokenizer.hpp b/operators/tokenizer/bert_tokenizer.hpp
index bd78d2635..4faddd067 100644
--- a/operators/tokenizer/bert_tokenizer.hpp
+++ b/operators/tokenizer/bert_tokenizer.hpp
@@ -9,6 +9,9 @@
 #include "string_tensor.h"
 #include "basic_tokenizer.hpp"
 
+#include
+
+
 class BertTokenizerVocab final {
  public:
  explicit BertTokenizerVocab(std::string_view vocab);
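Patches 5 through 8 are all the same class of one-line fix: a header that compiled on one platform only because a standard header arrived transitively, then broke on Linux or macOS where libstdc++/libc++ compose their internals differently. The durable rule is include-what-you-use; a minimal illustration (sketch_vocab.h is a hypothetical header, not part of this series):

    // sketch_vocab.h -- every std type named here gets its header stated
    // directly, so the file no longer depends on whatever ocos.h happens
    // to drag in on a particular platform/STL.
    #pragma once
    #include <string>
    #include <string_view>
    #include <unordered_map>

    struct SketchVocab {
      explicit SketchVocab(std::string_view vocab);
      std::unordered_map<std::string, int> token_to_id_;
    };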