From 62ac69c338ee3a590fa58c9028697d006a55112d Mon Sep 17 00:00:00 2001
From: Pringled
Date: Thu, 31 Oct 2024 14:39:54 +0100
Subject: [PATCH] Moved function

---
 model2vec/hf_utils.py     | 61 +--------------------------------
 scripts/export_to_onnx.py | 71 +++++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 70 deletions(-)

diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
index 1f46801..0fa3514 100644
--- a/model2vec/hf_utils.py
+++ b/model2vec/hf_utils.py
@@ -18,9 +18,6 @@
 logger = logging.getLogger(__name__)
 
 
-import json
-
-
 def save_pretrained(
     folder_path: Path,
     embeddings: np.ndarray,
@@ -42,63 +39,7 @@ def save_pretrained(
     folder_path.mkdir(exist_ok=True, parents=True)
     save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
     tokenizer.save(str(folder_path / "tokenizer.json"))
-    with open(folder_path / "config.json", "w") as config_file:
-        json.dump(config, config_file, indent=4, sort_keys=True)
-
-    # Save vocab.txt
-    with open(folder_path / "vocab.txt", "w") as vocab_file:
-        vocab = tokenizer.get_vocab()
-        for token in sorted(vocab, key=vocab.get):
-            vocab_file.write(f"{token}\n")
-
-    # Load tokenizer.json to use for generating tokenizer_config.json
-    with open(folder_path / "tokenizer.json", "r") as f:
-        tokenizer_data = json.load(f)
-
-    # Save special_tokens_map.json
-    special_tokens = {
-        "cls_token": "[CLS]",
-        "sep_token": "[SEP]",
-        "pad_token": "[PAD]",
-        "unk_token": "[UNK]",
-        "mask_token": "[MASK]",
-    }
-    with open(folder_path / "special_tokens_map.json", "w") as special_tokens_file:
-        json.dump(special_tokens, special_tokens_file, indent=4, sort_keys=True)
-
-    # Set fallback values for normalizer attributes in case normalizer is None
-    normalizer = tokenizer_data.get("normalizer")
-    do_lower_case = normalizer.get("lowercase") if normalizer else config.get("do_lower_case", True)
-    strip_accents = normalizer.get("strip_accents") if normalizer else None
-    tokenize_chinese_chars = normalizer.get("handle_chinese_chars", True) if normalizer else True
-
-    # Save tokenizer_config.json based on tokenizer.json
-    tokenizer_config = {
-        "added_tokens_decoder": {
-            str(token["id"]): {
-                "content": token["content"],
-                "lstrip": token.get("lstrip", False),
-                "normalized": token.get("normalized", False),
-                "rstrip": token.get("rstrip", False),
-                "single_word": token.get("single_word", False),
-                "special": token.get("special", True),
-            }
-            for token in tokenizer_data.get("added_tokens", [])
-        },
-        "clean_up_tokenization_spaces": True,
-        "cls_token": special_tokens["cls_token"],
-        "do_lower_case": do_lower_case,
-        "mask_token": special_tokens["mask_token"],
-        "model_max_length": config.get("seq_length", 512),
-        "pad_token": special_tokens["pad_token"],
-        "sep_token": special_tokens["sep_token"],
-        "strip_accents": strip_accents,
-        "tokenize_chinese_chars": tokenize_chinese_chars,
-        "tokenizer_class": "BertTokenizer",
-        "unk_token": special_tokens["unk_token"],
-    }
-    with open(folder_path / "tokenizer_config.json", "w") as tokenizer_config_file:
-        json.dump(tokenizer_config, tokenizer_config_file, indent=4, sort_keys=True)
+    json.dump(config, open(folder_path / "config.json", "w"))
 
     logger.info(f"Saved model to {folder_path}")
 
diff --git a/scripts/export_to_onnx.py b/scripts/export_to_onnx.py
index cbedaed..40df038 100644
--- a/scripts/export_to_onnx.py
+++ b/scripts/export_to_onnx.py
@@ -12,6 +12,8 @@
 from pathlib import Path
 
 import torch
+from tokenizers import Tokenizer
+from transformers import PreTrainedTokenizerFast
 
 from model2vec import StaticModel
 
@@ -70,30 +72,40 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
             encodings_ids = [token_ids[:max_length] for token_ids in encodings_ids]
         # Flatten input_ids and compute offsets
         offsets = torch.tensor([0] + [len(ids) for ids in encodings_ids[:-1]], dtype=torch.long).cumsum(dim=0)
-        input_ids = torch.tensor([token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long)
+        input_ids = torch.tensor(
+            [token_id for token_ids in encodings_ids for token_id in token_ids],
+            dtype=torch.long,
+        )
         return input_ids, offsets
 
 
-def export_model_to_onnx(model_path: str, save_path: str) -> None:
+def export_model_to_onnx(model_path: str, save_path: Path) -> None:
     """
-    Export the StaticModel to ONNX format.
+    Export the StaticModel to ONNX format and save tokenizer files.
 
     :param model_path: The path to the pretrained StaticModel.
-    :param save_path: The path to save the exported ONNX model
+    :param save_path: The directory to save the model and related files.
     """
-    # Convert the StaticModel to TorchStaticModel
+    save_path.mkdir(parents=True, exist_ok=True)
+
+    # Load the StaticModel
     model = StaticModel.from_pretrained(model_path)
     torch_model = TorchStaticModel(model)
 
+    # Save the model using save_pretrained
+    model.save_pretrained(save_path)
+
     # Prepare dummy input data
     texts = ["hello", "hello world"]
     input_ids, offsets = torch_model.tokenize(texts)
 
     # Export the model to ONNX
+    onnx_model_path = save_path / "onnx/model.onnx"
+    onnx_model_path.parent.mkdir(parents=True, exist_ok=True)
     torch.onnx.export(
         torch_model,
         (input_ids, offsets),
-        save_path,
+        str(onnx_model_path),
         export_params=True,
         opset_version=14,
         do_constant_folding=True,
@@ -106,13 +118,52 @@ def export_model_to_onnx(model_path: str, save_path: str) -> None:
         },
     )
 
-    logger.info(f"Model has been successfully exported to {save_path}")
+    logger.info(f"Model has been successfully exported to {onnx_model_path}")
+
+    # Save the tokenizer files required for transformers.js
+    save_tokenizer(model.tokenizer, save_path)
+    logger.info(f"Tokenizer files have been saved to {save_path}")
+
+
+def save_tokenizer(tokenizer: Tokenizer, save_directory: Path) -> None:
+    """
+    Save tokenizer files in a format compatible with Transformers.
+
+    :param tokenizer: The tokenizer from the StaticModel.
+    :param save_directory: The directory to save the tokenizer files.
+    """
+    # Convert the tokenizers.Tokenizer to a PreTrainedTokenizerFast and save
+    tokenizer_json_path = save_directory / "tokenizer.json"
+    tokenizer.save(str(tokenizer_json_path))
+
+    # Load the tokenizer using PreTrainedTokenizerFast
+    fast_tokenizer = PreTrainedTokenizerFast(
+        tokenizer_file=str(tokenizer_json_path),
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        sep_token="[SEP]",
+        mask_token="[MASK]",
+    )
+
+    # Save the tokenizer files
+    fast_tokenizer.save_pretrained(str(save_directory))
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Export StaticModel to ONNX format")
-    parser.add_argument("--model_path", type=Path, required=True, help="Path to the pretrained StaticModel")
-    parser.add_argument("--save_path", type=Path, required=True, help="Path to save the exported ONNX model")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the pretrained StaticModel",
+    )
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        required=True,
+        help="Directory to save the exported model and files",
+    )
    args = parser.parse_args()
-    export_model_to_onnx(args.model_path, args.save_path)
+    export_model_to_onnx(args.model_path, Path(args.save_path))