Skip to content

Commit

Permalink
[tokenizer] Fixes model zoo import bug (#3103)
Browse files Browse the repository at this point in the history
  • Loading branch information
frankfliu authored Apr 23, 2024
1 parent ec7d6fd commit 357b9b4
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ def save_onnx_model(self, model_info, args: Namespace, temp_dir: str):
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)

model_name = model_id.split("/")[-1]
logging.info(f"Saving onnxruntime model: {model_name}.onnx ...")
logging.info(f"Saving onnxruntime model: {model_id} ...")

from optimum.commands.optimum_cli import main

Expand All @@ -72,7 +71,7 @@ def save_onnx_model(self, model_info, args: Namespace, temp_dir: str):
include_types = "token_type_id" in inputs

tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_pipeline = PipelineHolder(model, tokenizer)
hf_pipeline = PipelineHolder(tokenizer, model)
size = self.save_to_model_zoo(model_info, args.output_dir,
"OnnxRuntime", temp_dir, hf_pipeline,
include_types)
Expand Down
28 changes: 18 additions & 10 deletions extensions/tokenizers/src/main/python/huggingface_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def list_models(self, args: Namespace) -> List[dict]:
existing_model = self.processed_models.get(model_id)
if existing_model:
existing_model["downloads"] = model_info.downloads
if not args.retry_failed:
if not args.retry_failed or existing_model[
"result"] == "success":
logging.info(f"Skip converted model: {model_id}.")
continue

Expand All @@ -119,15 +120,22 @@ def list_models(self, args: Namespace) -> List[dict]:
with open(config) as f:
config = json.load(f)

if "sentence-similarity" in model_info.tags:
task = "sentence-similarity"
else:
task, architecture = self.to_supported_task(config)
if not task:
logging.info(
f"Unsupported model architecture: {architecture} for {model_id}."
)
continue
task, architecture = self.to_supported_task(config)
if not task:
if "sentence-similarity" in model_info.tags:
task = "sentence-similarity"

if not task:
logging.info(
f"Unsupported model architecture: {architecture} for {model_id}."
)
continue

if args.category and args.category != task:
logging.info(
f"Skip {model_id}, expect task: {args.category}, detected {task}."
)
continue

model = {
"model_info": model_info,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import requests
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModel, AutoConfig

from huggingface_converter import HuggingfaceConverter, PipelineHolder
from huggingface_hub import hf_hub_download
Expand Down Expand Up @@ -76,7 +76,10 @@ def get_extra_arguments(self, hf_pipeline, model_id: str,
pass

if not "maxLength" in args:
config = hf_pipeline.model.config
if hasattr(hf_pipeline.model, "config"):
config = hf_pipeline.model.config
else:
config = AutoConfig.from_pretrained(model_id)
tokenizer = hf_pipeline.tokenizer
if hasattr(config, "max_position_embeddings") and hasattr(
tokenizer, "model_max_length"):
Expand Down

0 comments on commit 357b9b4

Please sign in to comment.