Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rationalise model_id requirements in SentenceTransformerModel #381

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- use try-except-else block for handling unexpected exceptions during integration tests by @rawwar([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
- Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
- Switch AL2 to AL2023 agent and DockerHub to ECR images in ml-models.JenkinsFile ([#377](https://github.com/opensearch-project/opensearch-py-ml/pull/377))
- Fix to ensure `model_id` is only required once when saving a model in `sentencetransformermodel.py` ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))
- Add a `config_output_path` param to `make_model_config_json` in `sentencetransformermodel.py` to choose the saving location of the model config JSON file. ([#381](https://github.com/opensearch-project/opensearch-py-ml/pull/381))

### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
Expand Down
27 changes: 20 additions & 7 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,9 @@ def train_model(
"""

if model_id is None:
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
model_id = self.model_id
if output_model_name is None:
output_model_name = str(self.model_id.split("/")[-1] + ".pt")
output_model_name = str(model_id.split("/")[-1] + ".pt")

# declare variables before assignment for training
corp_len = []
Expand Down Expand Up @@ -765,7 +765,7 @@ def _fill_null_truncation_field(
def save_as_pt(
self,
sentences: [str],
model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
model_id: str = None,
model_name: str = None,
save_json_folder_path: str = None,
model_output_path: str = None,
Expand All @@ -780,7 +780,7 @@ def save_as_pt(
Required, for example sentences = ['today is sunny']
:type sentences: List of string [str]
:param model_id:
sentence transformer model id to download model from sentence transformers.
Optional, sentence transformer model id to download model from sentence transformers.
default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
:type model_id: string
:param model_name:
Expand All @@ -806,6 +806,9 @@ def save_as_pt(
:rtype: string
"""

if model_id is None:
model_id = self.model_id

model = SentenceTransformer(model_id)

if model_name is None:
Expand Down Expand Up @@ -877,7 +880,7 @@ def save_as_pt(

def save_as_onnx(
self,
model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
model_id: str = None,
model_name: str = None,
save_json_folder_path: str = None,
model_output_path: str = None,
Expand All @@ -889,7 +892,7 @@ def save_as_onnx(
zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster

:param model_id:
sentence transformer model id to download model from sentence transformers.
Optional, sentence transformer model id to download model from sentence transformers.
default model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
:type model_id: string
:param model_name:
Expand All @@ -915,6 +918,9 @@ def save_as_onnx(
:rtype: string
"""

if model_id is None:
model_id = self.model_id

model = SentenceTransformer(model_id)

if model_name is None:
Expand Down Expand Up @@ -1143,6 +1149,7 @@ def make_model_config_json(
model_name: str = None,
version_number: str = 1,
model_format: str = "TORCH_SCRIPT",
config_output_path: str = None,
model_zip_file_path: str = None,
embedding_dimension: int = None,
pooling_mode: str = None,
Expand All @@ -1163,6 +1170,10 @@ def make_model_config_json(
:param model_format:
Optional, the format of the model. Default is "TORCH_SCRIPT".
:type model_format: string
:param config_output_path:
Optional, path to save the model config JSON file. If None, defaults to
the folder_path given in the constructor
:type config_output_path: string
:param model_zip_file_path:
Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx
depending on model_format. This zip file is used to compute model_content_size_in_bytes and
Expand Down Expand Up @@ -1198,6 +1209,8 @@ def make_model_config_json(
:rtype: string
"""
folder_path = self.folder_path
if config_output_path is None:
config_output_path = self.folder_path
config_json_file_path = os.path.join(folder_path, "config.json")
if model_name is None:
model_name = self.model_id
Expand Down Expand Up @@ -1313,7 +1326,7 @@ def make_model_config_json(
print(json.dumps(model_config_content, indent=4))

model_config_file_path = os.path.join(
folder_path, "ml-commons_model_config.json"
config_output_path, "ml-commons_model_config.json"
)
os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True)
with open(model_config_file_path, "w") as file:
Expand Down
123 changes: 109 additions & 14 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def test_make_model_config_json_for_torch_script():
model_id=model_id,
)

test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model5.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model5.make_model_config_json(
model_format="TORCH_SCRIPT", verbose=True
)
Expand All @@ -267,6 +267,38 @@ def test_make_model_config_json_for_torch_script():
clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_set_path_for_torch_script():
model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
model_format = "TORCH_SCRIPT"
expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
expected_model_config_data = {
"embedding_dimension": 384,
"pooling_mode": "MEAN",
"normalize_result": True,
}

clean_test_folder(TEST_FOLDER)
test_model5 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model5.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model5.make_model_config_json(
config_output_path=TEST_FOLDER, model_format="TORCH_SCRIPT", verbose=True
)

compare_model_config(
model_config_path_torch,
model_id,
model_format,
expected_model_description=expected_model_description,
expected_model_config_data=expected_model_config_data,
)

clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_for_onnx():
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
model_format = "ONNX"
Expand All @@ -283,7 +315,7 @@ def test_make_model_config_json_for_onnx():
model_id=model_id,
)

test_model6.save_as_onnx(model_id=model_id)
test_model6.save_as_onnx()
model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")

compare_model_config(
Expand All @@ -297,6 +329,38 @@ def test_make_model_config_json_for_onnx():
clean_test_folder(TEST_FOLDER)


def test_make_model_config_json_set_path_for_onnx():
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
model_format = "ONNX"
expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
expected_model_config_data = {
"embedding_dimension": 384,
"pooling_mode": "MEAN",
"normalize_result": False,
}

clean_test_folder(TEST_FOLDER)
test_model6 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model6.save_as_onnx()
model_config_path_onnx = test_model6.make_model_config_json(
config_output_path=TEST_FOLDER, model_format="ONNX"
)

compare_model_config(
model_config_path_onnx,
model_id,
model_format,
expected_model_description=expected_model_description,
expected_model_config_data=expected_model_config_data,
)

clean_test_folder(TEST_FOLDER)


def test_overwrite_fields_in_model_config():
model_id = "sentence-transformers/all-distilroberta-v1"
model_format = "TORCH_SCRIPT"
Expand All @@ -318,7 +382,7 @@ def test_overwrite_fields_in_model_config():
model_id=model_id,
)

test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model7.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model7.make_model_config_json(
model_format="TORCH_SCRIPT"
)
Expand All @@ -337,7 +401,7 @@ def test_overwrite_fields_in_model_config():
model_id=model_id,
)

test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model8.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model8.make_model_config_json(
model_format="TORCH_SCRIPT",
embedding_dimension=overwritten_model_config_data["embedding_dimension"],
Expand Down Expand Up @@ -367,7 +431,7 @@ def test_missing_readme_md_file():
model_id=model_id,
)

test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model9.save_as_pt(sentences=["today is sunny"])
temp_path = os.path.join(
TEST_FOLDER,
"README.md",
Expand Down Expand Up @@ -403,7 +467,7 @@ def test_missing_expected_description_in_readme_file():
model_id=model_id,
)

test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model10.save_as_pt(sentences=["today is sunny"])
temp_path = os.path.join(
TEST_FOLDER,
"README.md",
Expand Down Expand Up @@ -440,7 +504,7 @@ def test_overwrite_description():
model_id=model_id,
)

test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model11.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model11.make_model_config_json(
model_format=model_format, description=expected_model_description
)
Expand Down Expand Up @@ -471,7 +535,7 @@ def test_long_description():
model_id=model_id,
)

test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model12.save_as_pt(sentences=["today is sunny"])
model_config_path_torch = test_model12.make_model_config_json(
model_format=model_format
)
Expand Down Expand Up @@ -501,7 +565,7 @@ def test_truncation_parameter():
model_id=model_id,
)

test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model13.save_as_pt(sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -534,7 +598,7 @@ def test_undefined_model_max_length_in_tokenizer_for_torch_script():
model_id=model_id,
)

test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model14.save_as_pt(sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -563,7 +627,7 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx():
model_id=model_id,
)

test_model14.save_as_onnx(model_id=model_id)
test_model14.save_as_onnx()

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down Expand Up @@ -598,7 +662,6 @@ def test_save_as_pt_with_license():
)

test_model15.save_as_pt(
model_id=model_id,
sentences=["today is sunny"],
add_apache_license=True,
)
Expand All @@ -622,7 +685,7 @@ def test_save_as_onnx_with_license():
model_id=model_id,
)

test_model16.save_as_onnx(model_id=model_id, add_apache_license=True)
test_model16.save_as_onnx(add_apache_license=True)

compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format)

Expand All @@ -649,7 +712,7 @@ def test_zip_model_with_license():
model_id=model_id,
)

test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model17.save_as_pt(sentences=["today is sunny"])
compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format)

test_model17.zip_model(add_apache_license=True)
Expand All @@ -658,5 +721,37 @@ def test_zip_model_with_license():
clean_test_folder(TEST_FOLDER)


def test_save_as_pt_model_with_different_id():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
model_id2 = "sentence-transformers/all-MiniLM-L6-v2"
model_format = "TORCH_SCRIPT"
zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
zip_file_path2 = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip")
expected_filenames_wo_model_id = {
"msmarco-distilbert-base-tas-b.pt",
"tokenizer.json",
}
expected_filenames_with_model_id = {
"all-MiniLM-L6-v2.pt",
"tokenizer.json",
}

clean_test_folder(TEST_FOLDER)
test_model17 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model17.save_as_pt(sentences=["today is sunny"])
compare_model_zip_file(zip_file_path, expected_filenames_wo_model_id, model_format)

test_model17.save_as_pt(model_id=model_id2, sentences=["today is sunny"])
compare_model_zip_file(
zip_file_path2, expected_filenames_with_model_id, model_format
)

clean_test_folder(TEST_FOLDER)


clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)