Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rationalise model_id requirements in SentenceTransformerModel #381

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
- Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
- Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377))
- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))
- Add a `config_output_path` param to `make_model_config_json` in `sentencetransformermodel.py` to choose the saving location of the model config JSON file. ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))

### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
Expand Down
27 changes: 20 additions & 7 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,9 @@ def train_model(
"""

if model_id is None:
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
model_id = self.model_id
if output_model_name is None:
output_model_name = str(self.model_id.split("/")[-1] + ".pt")
output_model_name = str(model_id.split("/")[-1] + ".pt")

# declare variables before assignment for training
corp_len = []
Expand Down Expand Up @@ -765,7 +765,7 @@ def _fill_null_truncation_field(
def save_as_pt(
self,
sentences: [str],
model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
model_id: str = None,
model_name: str = None,
save_json_folder_path: str = None,
model_output_path: str = None,
Expand All @@ -780,7 +780,7 @@ def save_as_pt(
Required, for example sentences = ['today is sunny']
:type sentences: List of string [str]
:param model_id:
sentence transformer model id to download model from sentence transformers.
Optional, sentence transformer model id to download model from sentence transformers.
default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
:type model_id: string
:param model_name:
Expand All @@ -806,6 +806,9 @@ def save_as_pt(
:rtype: string
"""

if model_id is None:
model_id = self.model_id

model = SentenceTransformer(model_id)

if model_name is None:
Expand Down Expand Up @@ -877,7 +880,7 @@ def save_as_pt(

def save_as_onnx(
self,
model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
model_id: str = None,
model_name: str = None,
save_json_folder_path: str = None,
model_output_path: str = None,
Expand All @@ -889,7 +892,7 @@ def save_as_onnx(
zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster

:param model_id:
sentence transformer model id to download model from sentence transformers.
Optional, sentence transformer model id to download model from sentence transformers.
default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
:type model_id: string
:param model_name:
Expand All @@ -915,6 +918,9 @@ def save_as_onnx(
:rtype: string
"""

if model_id is None:
model_id = self.model_id

model = SentenceTransformer(model_id)

if model_name is None:
Expand Down Expand Up @@ -1143,6 +1149,7 @@ def make_model_config_json(
model_name: str = None,
version_number: str = 1,
model_format: str = "TORCH_SCRIPT",
config_output_path: str = None,
model_zip_file_path: str = None,
embedding_dimension: int = None,
pooling_mode: str = None,
Expand All @@ -1163,6 +1170,10 @@ def make_model_config_json(
:param model_format:
Optional, the format of the model. Default is "TORCH_SCRIPT".
:type model_format: string
:param config_output_path:
Optional, path to save the model config JSON file. If None, defaults to
the folder_path given in the constructor
:type config_output_path: string
:param model_zip_file_path:
Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx
depending on model_format. This zip file is used to compute model_content_size_in_bytes and
Expand Down Expand Up @@ -1198,6 +1209,8 @@ def make_model_config_json(
:rtype: string
"""
folder_path = self.folder_path
if config_output_path is None:
config_output_path = self.folder_path
config_json_file_path = os.path.join(folder_path, "config.json")
if model_name is None:
model_name = self.model_id
Expand Down Expand Up @@ -1313,7 +1326,7 @@ def make_model_config_json(
print(json.dumps(model_config_content, indent=4))

model_config_file_path = os.path.join(
folder_path, "ml-commons_model_config.json"
config_output_path, "ml-commons_model_config.json"
)
os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True)
with open(model_config_file_path, "w") as file:
Expand Down
123 changes: 109 additions & 14 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def test_make_model_config_json_for_torch_script():
model_id=model_id,
)

test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model5.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model5.make_model_config_json(
model_format="TORCH_SCRIPT", verbose=True
)
Expand All @@ -267,6 +267,38 @@ def test_make_model_config_json_for_torch_script():
clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_set_path_for_torch_script():
model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
model_format = "TORCH_SCRIPT"
expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
expected_model_config_data = {
"embedding_dimension": 384,
"pooling_mode": "MEAN",
"normalize_result": True,
}

clean_test_folder(TEST_FOLDER)
test_model5 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model5.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model5.make_model_config_json(
config_output_path=TEST_FOLDER, model_format="TORCH_SCRIPT", verbose=True
)

compare_model_config(
model_config_path_torch,
model_id,
model_format,
expected_model_description=expected_model_description,
expected_model_config_data=expected_model_config_data,
)

clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_for_onnx():
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
model_format = "ONNX"
Expand All @@ -283,7 +315,7 @@ def test_make_model_config_json_for_onnx():
model_id=model_id,
)

test_model6.save_as_onnx(model_id=model_id)
test_model6.save_as_onnx()
model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")

compare_model_config(
Expand All @@ -297,6 +329,38 @@ def test_make_model_config_json_for_onnx():
clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_set_path_for_onnx():
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
model_format = "ONNX"
expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
expected_model_config_data = {
"embedding_dimension": 384,
"pooling_mode": "MEAN",
"normalize_result": False,
}

clean_test_folder(TEST_FOLDER)
test_model6 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model6.save_as_onnx()
model_config_path_onnx = test_model6.make_model_config_json(
config_output_path=TEST_FOLDER, model_format="ONNX"
)

compare_model_config(
model_config_path_onnx,
model_id,
model_format,
expected_model_description=expected_model_description,
expected_model_config_data=expected_model_config_data,
)

clean_test_folder(TEST_FOLDER)


def test_overwrite_fields_in_model_config():
model_id = "sentence-transformers/all-distilroberta-v1"
model_format = "TORCH_SCRIPT"
Expand All @@ -318,7 +382,7 @@ def test_overwrite_fields_in_model_config():
model_id=model_id,
)

test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model7.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model7.make_model_config_json(
model_format="TORCH_SCRIPT"
)
Expand All @@ -337,7 +401,7 @@ def test_overwrite_fields_in_model_config():
model_id=model_id,
)

test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model8.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model8.make_model_config_json(
model_format="TORCH_SCRIPT",
embedding_dimension=overwritten_model_config_data["embedding_dimension"],
Expand Down Expand Up @@ -367,7 +431,7 @@ def test_missing_readme_md_file():
model_id=model_id,
)

test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model9.save_as_pt(sentences=["today is sunny"])
temp_path = os.path.join(
TEST_FOLDER,
"README.md",
Expand Down Expand Up @@ -403,7 +467,7 @@ def test_missing_expected_description_in_readme_file():
model_id=model_id,
)

test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model10.save_as_pt(sentences=["today is sunny"])
temp_path = os.path.join(
TEST_FOLDER,
"README.md",
Expand Down Expand Up @@ -440,7 +504,7 @@ def test_overwrite_description():
model_id=model_id,
)

test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model11.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model11.make_model_config_json(
model_format=model_format, description=expected_model_description
)
Expand Down Expand Up @@ -471,7 +535,7 @@ def test_long_description():
model_id=model_id,
)

test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model12.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model12.make_model_config_json(
model_format=model_format
)
Expand Down Expand Up @@ -501,7 +565,7 @@ def test_truncation_parameter():
model_id=model_id,
)

test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model13.save_as_pt(sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -534,7 +598,7 @@ def test_undefined_model_max_length_in_tokenizer_for_torch_script():
model_id=model_id,
)

test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model14.save_as_pt(sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -563,7 +627,7 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx():
model_id=model_id,
)

test_model14.save_as_onnx(model_id=model_id)
test_model14.save_as_onnx()

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -598,7 +662,6 @@ def test_save_as_pt_with_license():
)

test_model15.save_as_pt(
model_id=model_id,
sentences=["today is sunny"],
add_apache_license=True,
)
Expand All @@ -622,7 +685,7 @@ def test_save_as_onnx_with_license():
model_id=model_id,
)

test_model16.save_as_onnx(model_id=model_id, add_apache_license=True)
test_model16.save_as_onnx(add_apache_license=True)

compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format)

Expand All @@ -649,7 +712,7 @@ def test_zip_model_with_license():
model_id=model_id,
)

test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model17.save_as_pt(sentences=["today is sunny"])
compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format)

test_model17.zip_model(add_apache_license=True)
Expand All @@ -658,5 +721,37 @@ def test_zip_model_with_license():
clean_test_folder(TEST_FOLDER)


def test_save_as_pt_model_with_different_id():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
model_id2 = "sentence-transformers/all-MiniLM-L6-v2"
model_format = "TORCH_SCRIPT"
zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip")
expected_filenames_wo_model_id = {
"msmarco-distilbert-base-tas-b.pt",
"tokenizer.json",
}
expected_filenames_with_model_id = {
"all-MiniLM-L6-v2.pt",
"tokenizer.json",
}

clean_test_folder(TEST_FOLDER)
test_model17 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model17.save_as_pt(sentences=["today is sunny"])
compare_model_zip_file(zip_file_path, expected_filenames_wo_model_id, model_format)

test_model17.save_as_pt(model_id=model_id2, sentences=["today is sunny"])
compare_model_zip_file(
zip_file_path2, expected_filenames_with_model_id, model_format
)

clean_test_folder(TEST_FOLDER)


clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)