diff --git a/.ci/opensearch/Dockerfile b/.ci/opensearch/Dockerfile index d3c3d5257..74fce3515 100755 --- a/.ci/opensearch/Dockerfile +++ b/.ci/opensearch/Dockerfile @@ -1,10 +1,18 @@ -ARG OPENSEARCH_VERSION +ARG OPENSEARCH_VERSION=latest FROM opensearchproject/opensearch:$OPENSEARCH_VERSION +# OPENSEARCH_VERSION needs to be redefined as any arg before FROM is outside build scope. +# Reference: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact +ARG OPENSEARCH_VERSION=latest ARG opensearch_path=/usr/share/opensearch ARG opensearch_yml=$opensearch_path/config/opensearch.yml ARG SECURE_INTEGRATION RUN echo "plugins.ml_commons.only_run_on_ml_node: false" >> $opensearch_yml; RUN echo "plugins.ml_commons.native_memory_threshold: 100" >> $opensearch_yml; +RUN if [ "$OPENSEARCH_VERSION" == "2.11.0" ] ; then \ + echo "plugins.ml_commons.model_access_control_enabled: true" >> $opensearch_yml; \ + echo "plugins.ml_commons.allow_registering_model_via_local_file: true" >> $opensearch_yml; \ + echo "plugins.ml_commons.allow_registering_model_via_url: true" >> $opensearch_yml; \ +fi RUN if [ "$SECURE_INTEGRATION" != "true" ] ; then echo "plugins.security.disabled: true" >> $opensearch_yml; fi diff --git a/.ci/run-opensearch.sh b/.ci/run-opensearch.sh index 54de7660c..4e1fcd3b4 100644 --- a/.ci/run-opensearch.sh +++ b/.ci/run-opensearch.sh @@ -4,7 +4,6 @@ # to form a cluster suitable for running the REST API tests. # # Export the NUMBER_OF_NODES variable to start more than 1 node - script_path=$(dirname $(realpath -s $0)) source $script_path/imports.sh set -euo pipefail diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index e36c77355..1640df98f 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -2,6 +2,10 @@ name: Integration tests on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: integration: name: Integ @@ -13,6 +17,7 @@ jobs: secured: ["true"] entry: - { opensearch_version: 2.7.0 } + - { opensearch_version: 2.11.0 } steps: - name: Checkout diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f7523160..4d68dbe63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add workflow and scripts for automating model listing updating process by @thanawan-atc in ([#210](https://github.com/opensearch-project/opensearch-py-ml/pull/210)) - Add script to trigger ml-models-release jenkins workflow with generic webhook by @thanawan-atc in ([#211](https://github.com/opensearch-project/opensearch-py-ml/pull/211)) - Add example notebook for tracing and registering a CLIPTextModel to OpenSearch with the Neural Search plugin by @patrickbarnhart in ([#283](https://github.com/opensearch-project/opensearch-py-ml/pull/283)) +- Add support for train api functionality by @rawwar in ([#310](https://github.com/opensearch-project/opensearch-py-ml/pull/310)) +- Add support for Model Access Control - Register, Update, Search and Delete by @rawwar in ([#332](https://github.com/opensearch-project/opensearch-py-ml/pull/332)) ### Changed - Modify ml-models.JenkinsFile so that it takes model format into account and can be triggered with generic webhook by @thanawan-atc in ([#211](https://github.com/opensearch-project/opensearch-py-ml/pull/211)) @@ -25,6 +27,9 @@ Inspired from [Keep a 
Changelog](https://keepachangelog.com/en/1.0.0/) - Update model upload history - sentence-transformers/distiluse-base-multilingual-cased-v1 (v.1.0.1)(TORCH_SCRIPT) by @dhrubo-os ([#281](https://github.com/opensearch-project/opensearch-py-ml/pull/281)) - Update pretrained_models_all_versions.json (2023-09-14 10:28:41) by @dhrubo-os ([#282](https://github.com/opensearch-project/opensearch-py-ml/pull/282)) - Enable the model upload workflow to add model_content_size_in_bytes & model_content_hash_value to model config automatically @thanawan-atc ([#291](https://github.com/opensearch-project/opensearch-py-ml/pull/291)) +- Update pretrained_models_all_versions.json (2023-10-18 18:11:34) by @dhrubo-os ([#322](https://github.com/opensearch-project/opensearch-py-ml/pull/322)) +- Update model upload history - sentence-transformers/paraphrase-mpnet-base-v2 (v.1.0.0)(BOTH) by @dhrubo-os ([#321](https://github.com/opensearch-project/opensearch-py-ml/pull/321)) +- Replaced usage of `is_datetime_or_timedelta_dtype` with `is_timedelta64_dtype` and `is_datetime64_any_dtype`([#316](https://github.com/opensearch-project/opensearch-py-ml/pull/316)) ### Fixed - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) @@ -37,6 +42,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Roll over pretrained_model_listing.json because of ml-commons dependency by @thanawan-atc in ([#252](https://github.com/opensearch-project/opensearch-py-ml/pull/252)) - Fix pandas dependency issue in nox session by installing pandas package to python directly by @thanawan-atc in ([#266](https://github.com/opensearch-project/opensearch-py-ml/pull/266)) - Fix conditional job execution issue in model upload workflow by @thanawan-atc in ([#294](https://github.com/opensearch-project/opensearch-py-ml/pull/294)) +- fix bug in `MLCommonClient_client.upload_model` by @rawwar in ([#336](https://github.com/opensearch-project/opensearch-py-ml/pull/336)) ## [1.1.0] diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 186d4231d..335aef486 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -89,6 +89,9 @@ After navigating to OpenSearch Dashboards you should update the persistent setti You should paste this settings in the `Dev Tools` window and run it: + +For OpenSearch versions below 2.7 + ```yml PUT /_cluster/settings { @@ -101,6 +104,22 @@ You should paste this settings in the `Dev Tools` window and run it: } ``` +For OpenSearch versions 2.8 or above + +```yml + PUT /_cluster/settings + { + "persistent" : { + "plugins.ml_commons.only_run_on_ml_node" : false, + "plugins.ml_commons.native_memory_threshold" : 100, + "plugins.ml_commons.max_model_on_node": 20, + "plugins.ml_commons.enable_inhouse_python_model": true, + "plugins.ml_commons.allow_registering_model_via_local_file": true, + "plugins.ml_commons.allow_registering_model_via_url": true + } + } +``` + #### Review user tutorials to understand the key features and workflows - These [Notebook Examples](https://opensearch-project.github.io/opensearch-py-ml/examples/index.html) will show you how to use opensearch-py-ml for data exploration and machine learning. 
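
The same persistent settings can also be applied without Dev Tools by sending the request through opensearch-py. A minimal sketch, assuming a local development cluster reachable at `localhost:9200` with the default admin credentials; the connection details are illustrative and not part of this change:

```python
from opensearchpy import OpenSearch

# Illustrative connection to a local development cluster.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
)

# Persistent ml-commons settings for OpenSearch 2.8 or above.
client.cluster.put_settings(
    body={
        "persistent": {
            "plugins.ml_commons.only_run_on_ml_node": False,
            "plugins.ml_commons.native_memory_threshold": 100,
            "plugins.ml_commons.max_model_on_node": 20,
            "plugins.ml_commons.enable_inhouse_python_model": True,
            "plugins.ml_commons.allow_registering_model_via_local_file": True,
            "plugins.ml_commons.allow_registering_model_via_url": True,
        }
    }
)
```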
diff --git a/README.md b/README.md index ffea12e51..c5259c782 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ clusters using the [ml-commons](https://github.com/opensearch-project/ml-commons For more information, see [opensearch.org](https://opensearch.org/docs/latest/clients/opensearch-py-ml/) and the [API Doc](https://opensearch-project.github.io/opensearch-py-ml/index.html). -##Installing Opensearch-py-ml +## Installing Opensearch-py-ml Opensearch-py-ml can be installed from [PyPI](https://pypi.org/project/opensearch-py-ml) via pip: diff --git a/opensearch_py_ml/field_mappings.py b/opensearch_py_ml/field_mappings.py index 3dd5e2333..b3c83914a 100644 --- a/opensearch_py_ml/field_mappings.py +++ b/opensearch_py_ml/field_mappings.py @@ -43,10 +43,10 @@ from pandas.core.dtypes.common import is_bool_dtype # type: ignore from pandas.core.dtypes.common import ( is_datetime64_any_dtype, - is_datetime_or_timedelta_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, + is_timedelta64_dtype, ) from pandas.core.dtypes.inference import is_list_like @@ -91,7 +91,9 @@ def is_numeric(self) -> bool: @property def is_timestamp(self) -> bool: - return is_datetime_or_timedelta_dtype(self.pd_dtype) + return is_datetime64_any_dtype(self.pd_dtype) or is_timedelta64_dtype( + self.pd_dtype + ) @property def is_bool(self) -> bool: @@ -509,7 +511,7 @@ def _pd_dtype_to_os_dtype(pd_dtype) -> Optional[str]: os_dtype = "boolean" elif is_string_dtype(pd_dtype): os_dtype = "keyword" - elif is_datetime_or_timedelta_dtype(pd_dtype): + elif is_datetime64_any_dtype(pd_dtype) or is_timedelta64_dtype(pd_dtype): os_dtype = "date" elif is_datetime64_any_dtype(pd_dtype): os_dtype = "date" @@ -794,7 +796,9 @@ def metric_source_fields( pd_dtypes.append(np.dtype(pd_dtype)) os_field_names.append(os_field_name) os_date_formats.append(os_date_format) - elif include_timestamp and is_datetime_or_timedelta_dtype(pd_dtype): + elif include_timestamp and ( + is_datetime64_any_dtype(pd_dtype) or is_timedelta64_dtype(pd_dtype) + ): pd_dtypes.append(np.dtype(pd_dtype)) os_field_names.append(os_field_name) os_date_formats.append(os_date_format) diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index b4198d10d..72e2e158b 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -8,7 +8,7 @@ import json import time -from typing import Any, List, Union +from typing import Any, List, Optional, Union from deprecated.sphinx import deprecated from opensearchpy import OpenSearch @@ -21,6 +21,7 @@ MODEL_VERSION_FIELD, TIMEOUT, ) +from opensearch_py_ml.ml_commons.model_access_control import ModelAccessControl from opensearch_py_ml.ml_commons.model_execute import ModelExecute from opensearch_py_ml.ml_commons.model_uploader import ModelUploader @@ -35,6 +36,7 @@ def __init__(self, os_client: OpenSearch): self._client = os_client self._model_uploader = ModelUploader(os_client) self._model_execute = ModelExecute(os_client) + self.model_access_control = ModelAccessControl(os_client) def execute(self, algorithm_name: str, input_json: dict) -> dict: """ @@ -96,7 +98,9 @@ def upload_model( :rtype: string """ model_id = self._model_uploader._register_model( - model_path, model_config_path, isVerbose + model_path=model_path, + model_meta_path=model_config_path, + isVerbose=isVerbose, ) # loading the model chunks from model index @@ -105,6 +109,26 @@ def upload_model( return model_id + def train_model( + self, 
algorithm_name: str, input_json: dict, is_async: Optional[bool] = False + ) -> dict: + """ + This method trains an ML Model + """ + + params = {} + if not isinstance(input_json, dict): + input_json = json.loads(input_json) + if is_async: + params["async"] = "true" + + return self._client.transport.perform_request( + method="POST", + url=f"{ML_BASE_URI}/_train/{algorithm_name}", + body=input_json, + params=params, + ) + def register_model( self, model_path: str, diff --git a/opensearch_py_ml/ml_commons/model_access_control.py b/opensearch_py_ml/ml_commons/model_access_control.py new file mode 100644 index 000000000..bae4e603e --- /dev/null +++ b/opensearch_py_ml/ml_commons/model_access_control.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +from typing import List, Optional + +from opensearchpy import OpenSearch +from opensearchpy.exceptions import NotFoundError + +from opensearch_py_ml.ml_commons.ml_common_utils import ML_BASE_URI +from opensearch_py_ml.ml_commons.validators.model_access_control import ( + validate_create_model_group_parameters, + validate_delete_model_group_parameters, + validate_search_model_group_parameters, + validate_update_model_group_parameters, +) + + +class ModelAccessControl: + API_ENDPOINT = "model_groups" + + def __init__(self, os_client: OpenSearch): + self.client = os_client + + def register_model_group( + self, + name: str, + description: Optional[str] = None, + access_mode: Optional[str] = "private", + backend_roles: Optional[List[str]] = None, + add_all_backend_roles: Optional[bool] = False, + ): + validate_create_model_group_parameters( + name, description, access_mode, backend_roles, add_all_backend_roles + ) + + body = {"name": name, "add_all_backend_roles": add_all_backend_roles} + if description: + body["description"] = description + if access_mode: + body["access_mode"] = access_mode + if backend_roles: + body["backend_roles"] = backend_roles + + return self.client.transport.perform_request( + method="POST", url=f"{ML_BASE_URI}/{self.API_ENDPOINT}/_register", body=body + ) + + def update_model_group( + self, + update_query: dict, + model_group_id: Optional[str] = None, + ): + validate_update_model_group_parameters(update_query, model_group_id) + return self.client.transport.perform_request( + method="PUT", + url=f"{ML_BASE_URI}/{self.API_ENDPOINT}/{model_group_id}", + body=update_query, + ) + + def search_model_group(self, query: dict): + validate_search_model_group_parameters(query) + return self.client.transport.perform_request( + method="GET", url=f"{ML_BASE_URI}/{self.API_ENDPOINT}/_search", body=query + ) + + def search_model_group_by_name( + self, + model_group_name: str, + _source: Optional[List] = None, + size: Optional[int] = 1, + ): + query = {"query": {"match": {"name": model_group_name}}, "size": size} + if _source: + query["_source"] = _source + return self.search_model_group(query) + + def get_model_group_id_by_name(self, model_group_name: str): + try: + res = self.search_model_group_by_name(model_group_name) + if res["hits"]["hits"]: + return res["hits"]["hits"][0]["_id"] + else: + raise NotFoundError + except NotFoundError: + print(f"No model group found with name:{model_group_name}") + return None + except Exception as ex: + print(f"Error in get_model_group_id_by_name: {ex}") + 
return None + + def delete_model_group(self, model_group_id: str): + validate_delete_model_group_parameters(model_group_id) + return self.client.transport.perform_request( + method="DELETE", url=f"{ML_BASE_URI}/{self.API_ENDPOINT}/{model_group_id}" + ) + + def delete_model_group_by_name(self, model_group_name: str): + model_group_id = self.get_model_group_id_by_name(model_group_name) + if model_group_id is None: + raise NotFoundError(f"Model group {model_group_name} not found") + return self.delete_model_group(model_group_id=model_group_id) diff --git a/opensearch_py_ml/ml_commons/validators/__init__.py b/opensearch_py_ml/ml_commons/validators/__init__.py new file mode 100644 index 000000000..8d89f2583 --- /dev/null +++ b/opensearch_py_ml/ml_commons/validators/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. diff --git a/opensearch_py_ml/ml_commons/validators/model_access_control.py b/opensearch_py_ml/ml_commons/validators/model_access_control.py new file mode 100644 index 000000000..2fb928e25 --- /dev/null +++ b/opensearch_py_ml/ml_commons/validators/model_access_control.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +""" Module for validating model access control parameters """ + +from typing import List, Optional + +ACCESS_MODES = ["public", "private", "restricted"] + +NoneType = type(None) + + +def _validate_model_group_name(name: str): + if not name or not isinstance(name, str): + raise ValueError("name is required and needs to be a string") + + +def _validate_model_group_description(description: Optional[str]): + if not isinstance(description, (NoneType, str)): + raise ValueError("description needs to be a string") + + +def _validate_model_group_access_mode(access_mode: Optional[str]): + if access_mode is None: + return + if access_mode not in ACCESS_MODES: + raise ValueError(f"access_mode can must be in {ACCESS_MODES} or None") + + +def _validate_model_group_backend_roles(backend_roles: Optional[List[str]]): + if not isinstance(backend_roles, (NoneType, list)): + raise ValueError("backend_roles should either be None or a list of roles names") + + +def _validate_model_group_add_all_backend_roles(add_all_backend_roles: Optional[bool]): + if not isinstance(add_all_backend_roles, (NoneType, bool)): + raise ValueError("add_all_backend_roles should be a boolean") + + +def _validate_model_group_query(query: dict, operation: Optional[str] = None): + if not isinstance(query, dict): + raise ValueError("query needs to be a dictionary") + + if operation and not isinstance(operation, str): + raise ValueError("operation needs to be a string") + + +def validate_create_model_group_parameters( + name: str, + description: Optional[str] = None, + access_mode: Optional[str] = "private", + backend_roles: Optional[List[str]] = None, + add_all_backend_roles: Optional[bool] = False, +): + _validate_model_group_name(name) + _validate_model_group_description(description) + _validate_model_group_access_mode(access_mode) + _validate_model_group_backend_roles(backend_roles) + 
_validate_model_group_add_all_backend_roles(add_all_backend_roles) + + if access_mode == "restricted": + if not backend_roles and not add_all_backend_roles: + raise ValueError( + "You must specify either backend_roles or add_all_backend_roles=True for restricted access_mode" + ) + + if backend_roles and add_all_backend_roles: + raise ValueError( + "You cannot specify both backend_roles and add_all_backend_roles=True at the same time" + ) + + elif access_mode == "private": + if backend_roles or add_all_backend_roles: + raise ValueError( + "You must not specify backend_roles or add_all_backend_roles=True for a private model group" + ) + + +def validate_update_model_group_parameters(update_query: dict, model_group_id: str): + if not isinstance(model_group_id, str): + raise ValueError("Invalid model_group_id. model_group_id needs to be a string") + + if not isinstance(update_query, dict): + raise ValueError("Invalid update_query. update_query needs to be a dictionary") + + +def validate_delete_model_group_parameters(model_group_id: str): + if not isinstance(model_group_id, str): + raise ValueError("Invalid model_group_id. model_group_id needs to be a string") + + +def validate_search_model_group_parameters(query: dict): + _validate_model_group_query(query) diff --git a/opensearch_py_ml/ml_models/__init__.py b/opensearch_py_ml/ml_models/__init__.py index 3ec96ebd5..eacea92f0 100644 --- a/opensearch_py_ml/ml_models/__init__.py +++ b/opensearch_py_ml/ml_models/__init__.py @@ -7,5 +7,6 @@ from .metrics_correlation.mcorr import MCorr from .sentencetransformermodel import SentenceTransformerModel +from .question_answering_model import QuestionAnsweringModel -__all__ = ["SentenceTransformerModel", "MCorr"] +__all__ = ["SentenceTransformerModel", "MCorr", "QuestionAnsweringModel"] diff --git a/opensearch_py_ml/ml_models/question_answering_model.py b/opensearch_py_ml/ml_models/question_answering_model.py new file mode 100644 index 000000000..fcafde68c --- /dev/null +++ b/opensearch_py_ml/ml_models/question_answering_model.py @@ -0,0 +1,542 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +import json +import os +import pickle +import platform +import random +import re +import shutil +import subprocess +import time +from pathlib import Path +from typing import List +from zipfile import ZipFile + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import requests +import torch +import yaml +from accelerate import Accelerator, notebook_launcher +from mdutils.fileutils import MarkDownFile +# from sentence_transformers import SentenceTransformer +# from sentence_transformers.models import Normalize, Pooling, Transformer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForQuestionAnswering +import transformers + + +from opensearch_py_ml.ml_commons.ml_common_utils import ( + _generate_model_content_hash_value, +) + +LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" + + +class QuestionAnsweringModel: + """ + Class for tracing the QuestionAnswering model. 
+ """ + # distilbert-base-cased-distilled-squad + DEFAULT_MODEL_ID = "distilbert-base-cased-distilled-squad" + SYNTHETIC_QUERY_FOLDER = "synthetic_queries" + + def __init__( + self, + model_id: str = DEFAULT_MODEL_ID, + folder_path: str = None, + overwrite: bool = False, + ) -> None: + """ + Initiate a question answering model class object. The model id will be used to download + pretrained model from the hugging-face and served as the default name for model files, and the folder_path + will be the default location to store files generated in the following functions + + :param model_id: Optional, the huggingface mode id to download the model, + default model id: 'distilbert-base-cased-distilled-squad' + :type model_id: string + :param folder_path: Optional, the path of the folder to save output files, such as queries, pre-trained model, + after-trained custom model and configuration files. if None, default as "/model_files/" under the current + work directory + :type folder_path: string + :param overwrite: Optional, choose to overwrite the folder at folder path. Default as false. When training + different question answering models, it's recommended to give designated folder path every time. + Users can choose to overwrite = True to overwrite previous runs + :type overwrite: bool + :return: no return value expected + :rtype: None + """ + default_folder_path = os.path.join( + os.getcwd(), "question_answering_model_files" + ) + + if folder_path is None: + self.folder_path = default_folder_path + else: + self.folder_path = folder_path + + # Check if self.folder_path exists + if os.path.exists(self.folder_path) and not overwrite: + print( + "To prevent overwriting, please enter a different folder path or delete the folder or enable " + "overwrite = True " + ) + raise Exception( + str("The default folder path already exists at : " + self.folder_path) + ) + + self.model_id = model_id + self.torch_script_zip_file_path = None + self.onnx_zip_file_path = None + + + def save_as_pt( + self, + sentences: [str] = ["today is sunny"], + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download the model directly from huggingface, convert model to torch script format, + zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + + :param sentences: + Required, for example sentences = ['today is sunny'] + :type sentences: List of string [str] + :param model_id: + question answering model id to download model from question answerings. + default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.pt". If None, default takes the + model_id and add the extension with ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/"). If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file. If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". 
If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".pt") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + # handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2") + # MODEL_MAX_SEQ_LENGTH = 512 + # if tokenizer.model_max_length > model.get_max_seq_length(): + # tokenizer.model_max_length = model.get_max_seq_length() + # print( + # f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {tokenizer.model_max_length}" + # ) + + # save tokenizer.json in save_json_folder_name + # max_position_embeddings + + # AutoTokenizer will save tokenizer.json in save_json_folder_name + # DistilBertTokenizer will save it in cache: /Users/faradawn/.cache/huggingface/hub/models/... + tokenizer.save_pretrained(save_json_folder_path) + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + + # convert to pt format will need to be in cpu, + # set the device to cpu, convert its input_ids and attention_mask in cpu and save as .pt format + device = torch.device("cpu") + cpu_model = model.to(device) + features = tokenizer( + sentences, return_tensors="pt", padding=True, truncation=True + ).to(device) + + compiled_model = torch.jit.trace( + cpu_model, + (features["input_ids"], features["attention_mask"]), + strict=False + ) + torch.jit.save(compiled_model, model_path) + print("Traced torchscript model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.torch_script_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + def save_as_onnx( + self, + model_id="distilbert-base-cased-distilled-squad", + model_name: str = None, + save_json_folder_path: str = None, + model_output_path: str = None, + zip_file_name: str = None, + add_apache_license: bool = False, + ) -> str: + """ + Download 
question answering model directly from huggingface, convert model to onnx format, + zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + + :param model_id: + question answering model id to download model from question answerings. + default model_id = "distilbert-base-cased-distilled-squad" + :type model_id: string + :param model_name: + Optional, model name to name the model file, e.g, "sample_model.pt". If None, default takes the + model_id and add the extension with ".pt" + :type model_name: string + :param save_json_folder_path: + Optional, path to save model json file, e.g, "home/save_pre_trained_model_json/"). If None, default as + default_folder_path from the constructor + :type save_json_folder_path: string + :param model_output_path: + Optional, path to save traced model zip file. If None, default as + default_folder_path from the constructor + :type model_output_path: string + :param zip_file_name: + Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the model_id + and add the extension with ".zip" + :type zip_file_name: string + :param add_apache_license: + Optional, whether to add a Apache-2.0 license file to model zip file + :type add_apache_license: string + :return: model zip file path. The file path where the zip file is being saved + :rtype: string + """ + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForQuestionAnswering.from_pretrained(model_id) + + if model_name is None: + model_name = str(model_id.split("/")[-1] + ".onnx") + + model_path = os.path.join(self.folder_path, model_name) + + if save_json_folder_path is None: + save_json_folder_path = self.folder_path + + if model_output_path is None: + model_output_path = self.folder_path + + if zip_file_name is None: + zip_file_name = str(model_id.split("/")[-1] + ".zip") + zip_file_path = os.path.join(model_output_path, zip_file_name) + + # save tokenizer.json in save_json_folder_name + tokenizer.save_pretrained(save_json_folder_path) + + # Find the tokenizer.json file path in cache: /Users/faradawn/.cache/huggingface/hub/models/... 
+ tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + + # Open the tokenizer.json and replace the truncation field + with open(tokenizer_file_path) as user_file: + parsed_json = json.load(user_file) + + if "truncation" not in parsed_json or parsed_json["truncation"] is None: + parsed_json["truncation"] = { + "direction": "Right", + "max_length": tokenizer.model_max_length, + "strategy": "LongestFirst", + "stride": 0, + } + + # Save tokenizer + tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json") + with open(tokenizer_file_path, "w") as file: + json.dump(parsed_json, file, indent=2) + + # load config + model_kind, model_onnx_config = transformers.onnx.FeaturesManager.check_supported_model_or_raise(model, feature="question-answering") + onnx_config = model_onnx_config(model.config) + + # export + onnx_inputs, onnx_outputs = transformers.onnx.export( + preprocessor=tokenizer, + model=model, + config=onnx_config, + opset=13, + output=Path(model_path) + ) + + print("Traced onnx model is saved to ", model_path) + + # zip model file along with tokenizer.json (and license file) as output + with ZipFile(str(zip_file_path), "w") as zipObj: + zipObj.write( + model_path, + arcname=str(model_name), + ) + zipObj.write( + os.path.join(save_json_folder_path, "tokenizer.json"), + arcname="tokenizer.json", + ) + if add_apache_license: + self._add_apache_license_to_model_zip_file(zip_file_path) + + self.onnx_zip_file_path = zip_file_path + print("zip file is saved to ", zip_file_path, "\n") + return zip_file_path + + + + def make_model_config_json( + self, + model_name: str = None, + version_number: str = 1, + model_format: str = "TORCH_SCRIPT", + model_zip_file_path: str = None, + embedding_dimension: int = None, + pooling_mode: str = None, + normalize_result: bool = None, + description: str = None, + all_config: str = None, + model_type: str = None, + verbose: bool = False, + ) -> str: + """ + Parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. + If all required fields are given by users, use the given parameters and will skip reading the config.json + + :param model_name: + Optional, The name of the model. If None, default is model id, for example, + 'distilbert-base-cased-distilled-squad' + :type model_name: string + :param model_format: + Optional, the format of the model. Default is "TORCH_SCRIPT". + :type model_format: string + :param model_zip_file_path: + Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx + depending on model_format. This zip file is used to compute model_content_size_in_bytes and + model_content_hash_value. + :type model_zip_file_path: string + :param version_number: + Optional, The version number of the model. Default is 1 + :type version_number: string + :param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension + from the pre-trained hugging-face model object. + :type embedding_dimension: int + :param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode + from the pre-trained hugging-face model object. + :type pooling_mode: string + :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained + hugging-face model object. + :type normalize_result: bool + :param description: Optional, the description of the model. If None, get description from the README.md + file in the model folder. 
+ :type description: str + :param all_config: + Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained + hugging-face model + :type all_config: dict + :param model_type: + Optional, the model_type of the model. If None, parse model_type from the config file of pre-trained + hugging-face model + :type model_type: string + :param verbose: + optional, use printing more logs. Default as false + :type verbose: bool + :return: model config file path. The file path where the model config file is being saved + :rtype: string + """ + folder_path = self.folder_path + config_json_file_path = os.path.join(folder_path, "config.json") + if model_name is None: + model_name = self.model_id + + # if user input model_type/embedding_dimension/pooling_mode, it will skip this step. + model = AutoModelForQuestionAnswering.from_pretrained(self.model_id) + model.save_pretrained(self.folder_path) + + + # fill the empty fields + if ( + model_type is None + or embedding_dimension is None + or pooling_mode is None + or normalize_result is None + ): + try: + if embedding_dimension is None: + embedding_dimension = model.config.dim + if model_type is None: + model_type = "distilbert" + if pooling_mode is None: + pooling_mode = "CLS" + if normalize_result is None: + normalize_result = False + + # for str_idx, module in model._modules.items(): + # print(f"=== idx {str_idx}, module name {module.__class__.__name__}, module {module}") + # if model_type is None and isinstance(module, Transformer): + # model_type = module.auto_model.__class__.__name__ + # model_type = model_type.lower().rstrip("model") + # elif pooling_mode is None and isinstance(module, Pooling): + # pooling_mode = module.get_pooling_mode_str().upper() + # elif normalize_result is None and isinstance(module, Normalize): + # normalize_result = True + # TODO: Support 'Dense' module + + except Exception as e: + raise Exception( + f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" + ) + + # fill the description + if description is None: + readme_file_path = os.path.join(self.folder_path, "README.md") + if os.path.exists(readme_file_path): + try: + if verbose: + print("reading README.md file") + description = self._get_model_description_from_readme_file( + readme_file_path + ) + except Exception as e: + print(f"Cannot scrape model description from README.md file: {e}") + description = self._generate_default_model_description( + embedding_dimension + ) + else: + print("Using default model description") + description = "This is a question-answering model: it provides answers to a question and context." + + # dump the config.json file + if all_config is None: + if not os.path.exists(config_json_file_path): + raise Exception( + str( + "Cannot find config.json in" + + config_json_file_path + + ". Please check the config.son file in the path." + ) + ) + try: + with open(config_json_file_path) as f: + if verbose: + print("reading config file from: " + config_json_file_path) + config_content = json.load(f) + if all_config is None: + all_config = config_content + except IOError: + print( + "Cannot open in config.json file at ", + config_json_file_path, + ". 
Please check the config.json ", + "file in the path.", + ) + + model_config_content = { + "name": model_name, + "version": version_number, + "description": description, + "model_format": model_format, + "model_task_type": "QUESTION_ANSWERING", + "model_config": { + "model_type": model_type, + "embedding_dimension": embedding_dimension, + "framework_type": "sentence_transformers", + "pooling_mode": pooling_mode, + "normalize_result": normalize_result, + "all_config": json.dumps(all_config), + }, + } + + # get model size and hash value + if model_zip_file_path is None: + model_zip_file_path = ( + self.torch_script_zip_file_path + if model_format == "TORCH_SCRIPT" + else self.onnx_zip_file_path + ) + + # model_zip_file_path = '/Users/faradawn/CS/opensearch-py-ml/opensearch_py_ml/ml_models/question-model-folder/distilbert-base-cased-distilled-squad.zip' + + if model_zip_file_path is None: + print( + "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and 'model_content_hash_value' fields. You can include these fields by specifying the 'model_zip_file_path' parameter. Failure to do so may result in the model registration process encountering issues." + ) + else: + model_config_content["model_content_size_in_bytes"] = os.stat( + model_zip_file_path + ).st_size + model_config_content[ + "model_content_hash_value" + ] = _generate_model_content_hash_value(model_zip_file_path) + + if verbose: + print("generating ml-commons_model_config.json file...\n") + print(json.dumps(model_config_content, indent=4)) + + model_config_file_path = os.path.join( + folder_path, "ml-commons_model_config.json" + ) + os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) + with open(model_config_file_path, "w") as file: + json.dump(model_config_content, file, indent=4) + print( + "ml-commons_model_config.json file is saved at : ", model_config_file_path + ) + + return model_config_file_path + diff --git a/requirements-dev.txt b/requirements-dev.txt index b51ae0e21..16640f779 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=1.5.2,<3 +pandas>=1.5.2,<2 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/requirements.txt b/requirements.txt index ab798460d..ab4e94821 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # # Basic requirements # -pandas>=1.5.2,<3 +pandas>=1.5.2,<2 matplotlib>=3.6.2,<4 numpy>=1.24.0,<2 opensearch-py>=2.2.0 diff --git a/tests/ml_commons/test_ml_commons_client.py b/tests/ml_commons/test_ml_commons_client.py index 10be2c164..27cd79dc9 100644 --- a/tests/ml_commons/test_ml_commons_client.py +++ b/tests/ml_commons/test_ml_commons_client.py @@ -7,9 +7,14 @@ import os import shutil +import time +from json import JSONDecodeError from os.path import exists -from opensearchpy import OpenSearch +import pytest +from opensearchpy import OpenSearch, helpers +from opensearchpy.exceptions import RequestError +from sklearn.datasets import load_iris from opensearch_py_ml.ml_commons import MLCommonClient from opensearch_py_ml.ml_commons.model_uploader import ModelUploader @@ -44,6 +49,55 @@ PRETRAINED_MODEL_FORMAT = "TORCH_SCRIPT" +@pytest.fixture +def iris_index(): + index_name = "test__index__iris_data" + index_mapping = { + "mappings": { + "properties": { + "sepal_length": {"type": "float"}, + "sepal_width": {"type": "float"}, + "petal_length": {"type": "float"}, + "petal_width": {"type": "float"}, + "species": {"type": "keyword"}, + } + } + } + + if 
ml_client._client.indices.exists(index=index_name): + ml_client._client.indices.delete(index=index_name) + ml_client._client.indices.create(index=index_name, body=index_mapping) + + iris = load_iris() + iris_data = iris.data + iris_target = iris.target + iris_species = [iris.target_names[i] for i in iris_target] + + actions = [ + { + "_index": index_name, + "_source": { + "sepal_length": sepal_length, + "sepal_width": sepal_width, + "petal_length": petal_length, + "petal_width": petal_width, + "species": species, + }, + } + for (sepal_length, sepal_width, petal_length, petal_width), species in zip( + iris_data, iris_species + ) + ] + + helpers.bulk(ml_client._client, actions) + # without the sleep, test is failing. + time.sleep(2) + + yield index_name + + ml_client._client.indices.delete(index=index_name) + + def clean_test_folder(TEST_FOLDER): if os.path.exists(TEST_FOLDER): for files in os.listdir(TEST_FOLDER): @@ -72,6 +126,44 @@ def test_init(): assert isinstance(ml_client._model_uploader, ModelUploader) +def test_train(iris_index): + algorithm_name = "kmeans" + input_json_sync = { + "parameters": {"centroids": 3, "iterations": 10, "distance_type": "COSINE"}, + "input_query": { + "_source": ["petal_length", "petal_width"], + "size": 10000, + }, + "input_index": [iris_index], + } + response = ml_client.train_model(algorithm_name, input_json_sync) + assert isinstance(response, dict) + assert "model_id" in response + assert "status" in response + assert response["status"] == "COMPLETED" + + input_json_async = { + "parameters": {"centroids": 3, "iterations": 10, "distance_type": "COSINE"}, + "input_query": { + "_source": ["petal_length", "petal_width"], + "size": 10000, + }, + "input_index": [iris_index], + } + response = ml_client.train_model(algorithm_name, input_json_async, is_async=True) + + assert isinstance(response, dict) + assert "task_id" in response + assert "status" in response + assert response["status"] == "CREATED" + + with pytest.raises(JSONDecodeError): + ml_client.train_model(algorithm_name, "", is_async=True) + + with pytest.raises(RequestError): + ml_client.train_model(algorithm_name, {}, is_async=True) + + def test_execute(): raised = False try: @@ -100,102 +192,6 @@ def test_execute(): ), "Raised Exception during execute API testing with JSON string" -def test_search(): - # Search task cases - raised = False - try: - search_task_obj = ml_client.search_task( - input_json='{"query": {"match_all": {}},"size": 1}' - ) - assert search_task_obj["hits"]["hits"] != [] - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching task" - - raised = False - try: - search_task_obj = ml_client.search_task( - input_json={"query": {"match_all": {}}, "size": 1} - ) - assert search_task_obj["hits"]["hits"] != [] - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching task" - - raised = False - try: - search_task_obj = ml_client.search_task(input_json=15) - assert search_task_obj == "Invalid JSON object passed as argument." - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching task" - - raised = False - try: - search_task_obj = ml_client.search_task(input_json="15") - assert search_task_obj == "Invalid JSON object passed as argument." 
- except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching task" - - raised = False - try: - search_task_obj = ml_client.search_task( - input_json='{"query": {"match_all": {}},size: 1}' - ) - assert search_task_obj == "Invalid JSON string passed as argument." - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching task" - - # Search model cases - raised = False - try: - search_model_obj = ml_client.search_model( - input_json='{"query": {"match_all": {}},"size": 1}' - ) - assert search_model_obj["hits"]["hits"] != [] - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching model" - - raised = False - try: - search_model_obj = ml_client.search_model( - input_json={"query": {"match_all": {}}, "size": 1} - ) - assert search_model_obj["hits"]["hits"] != [] - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching model" - - raised = False - try: - search_model_obj = ml_client.search_model(input_json=15) - assert search_model_obj == "Invalid JSON object passed as argument." - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching model" - - raised = False - try: - search_model_obj = ml_client.search_model(input_json="15") - assert search_model_obj == "Invalid JSON object passed as argument." - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching model" - - raised = False - try: - search_model_obj = ml_client.search_model( - input_json='{"query": {"match_all": {}},size: 1}' - ) - assert search_model_obj == "Invalid JSON string passed as argument." - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in searching model" - - def test_DEPRECATED_integration_pretrained_model_upload_unload_delete(): raised = False try: @@ -304,7 +300,6 @@ def test_DEPRECATED_integration_model_train_upload_full_cycle(): assert model_file_exists == True assert model_config_file_exists == True if model_file_exists and model_config_file_exists: - raised = False model_id = "" task_id = "" try: @@ -312,12 +307,10 @@ def test_DEPRECATED_integration_model_train_upload_full_cycle(): MODEL_PATH, MODEL_CONFIG_FILE_PATH, load_model=False, isVerbose=True ) print("Model_id:", model_id) - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception during model registration" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when uploading model: {ex}" if model_id: - raised = False try: ml_load_status = ml_client.load_model(model_id, wait_until_loaded=False) task_id = ml_load_status.get("task_id") @@ -325,21 +318,17 @@ def test_DEPRECATED_integration_model_train_upload_full_cycle(): ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_state") != "DEPLOY_FAILED" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in model deployment" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when loading model: {ex}" - raised = False try: ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_format") == "TORCH_SCRIPT" assert ml_model_status.get("algorithm") == "TEXT_EMBEDDING" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in getting model info" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when getting model info: {ex}" if task_id: - raised = False 
ml_task_status = None try: ml_task_status = ml_client.get_task_info( @@ -348,47 +337,39 @@ def test_DEPRECATED_integration_model_train_upload_full_cycle(): assert ml_task_status.get("task_type") == "DEPLOY_MODEL" print("State:", ml_task_status.get("state")) assert ml_task_status.get("state") != "FAILED" - except: # noqa: E722 - print("Model Task Status:", ml_task_status) - raised = True - assert raised == False, "Raised Exception in pulling task info" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when getting task info: {ex}" # This is test is being flaky. Sometimes the test is passing and sometimes showing 500 error # due to memory circuit breaker. # Todo: We need to revisit this test. try: - raised = False sentences = ["First test sentence", "Second test sentence"] embedding_result = ml_client.generate_embedding(model_id, sentences) print(embedding_result) assert len(embedding_result.get("inference_results")) == 2 - except: # noqa: E722 - raised = True - assert ( - raised == False - ), "Raised Exception in generating sentence embedding" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when generating embedding: {ex}" try: delete_task_obj = ml_client.delete_task(task_id) assert delete_task_obj.get("result") == "deleted" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in deleting task" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when deleting task: {ex}" try: ml_client.unload_model(model_id) ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_state") != "UNDEPLOY_FAILED" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in model undeployment" + except Exception as ex: # noqa: E722 + assert ( + False + ), f"Exception occurred when pretrained model undeployment : {ex}" - raised = False try: delete_model_obj = ml_client.delete_model(model_id) assert delete_model_obj.get("result") == "deleted" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in deleting model" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when deleting model: {ex}" def test_integration_model_train_register_full_cycle(): @@ -412,7 +393,6 @@ def test_integration_model_train_register_full_cycle(): task_id = "" # Testing deploy_model = True for codecov/patch - raised = False try: ml_client.register_model( model_path=MODEL_PATH, @@ -420,11 +400,9 @@ def test_integration_model_train_register_full_cycle(): deploy_model=True, isVerbose=True, ) - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception during first model registration" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred during first model registration: {ex}" - raised = False try: model_id = ml_client.register_model( model_path=MODEL_PATH, @@ -433,12 +411,10 @@ def test_integration_model_train_register_full_cycle(): isVerbose=True, ) print("Model_id:", model_id) - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception during second model registration" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred during second model registration: {ex}" if model_id: - raised = False try: ml_load_status = ml_client.deploy_model( model_id, wait_until_deployed=False @@ -448,21 +424,17 @@ def test_integration_model_train_register_full_cycle(): ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_state") != 
"DEPLOY_FAILED" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in model deployment" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred during model deployment: {ex}" - raised = False try: ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_format") == "TORCH_SCRIPT" assert ml_model_status.get("algorithm") == "TEXT_EMBEDDING" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in getting model info" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when getting model info: {ex}" if task_id: - raised = False ml_task_status = None try: ml_task_status = ml_client.get_task_info( @@ -471,48 +443,133 @@ def test_integration_model_train_register_full_cycle(): assert ml_task_status.get("task_type") == "DEPLOY_MODEL" print("State:", ml_task_status.get("state")) assert ml_task_status.get("state") != "FAILED" - except: # noqa: E722 - print("Model Task Status:", ml_task_status) - raised = True - assert raised == False, "Raised Exception in pulling task info" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred in pulling task info: {ex}" # This is test is being flaky. Sometimes the test is passing and sometimes showing 500 error # due to memory circuit breaker. # Todo: We need to revisit this test. try: - raised = False sentences = ["First test sentence", "Second test sentence"] embedding_result = ml_client.generate_embedding(model_id, sentences) print(embedding_result) assert len(embedding_result.get("inference_results")) == 2 - except: # noqa: E722 - raised = True - assert ( - raised == False - ), "Raised Exception in generating sentence embedding" + except Exception as ex: # noqa: E722 + assert ( + False + ), f"Exception occurred when generating sentence embedding: {ex}" try: delete_task_obj = ml_client.delete_task(task_id) assert delete_task_obj.get("result") == "deleted" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in deleting task" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred when deleting task: {ex}" try: ml_client.undeploy_model(model_id) ml_model_status = ml_client.get_model_info(model_id) assert ml_model_status.get("model_state") != "UNDEPLOY_FAILED" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in model undeployment" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred during model undeployment : {ex}" - raised = False try: delete_model_obj = ml_client.delete_model(model_id) assert delete_model_obj.get("result") == "deleted" - except: # noqa: E722 - raised = True - assert raised == False, "Raised Exception in deleting model" + except Exception as ex: # noqa: E722 + assert False, f"Exception occurred during model deletion : {ex}" + + +def test_search(): + # Search task cases + raised = False + try: + search_task_obj = ml_client.search_task( + input_json='{"query": {"match_all": {}},"size": 1}' + ) + assert search_task_obj["hits"]["hits"] != [] + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching task" + + raised = False + try: + search_task_obj = ml_client.search_task( + input_json={"query": {"match_all": {}}, "size": 1} + ) + assert search_task_obj["hits"]["hits"] != [] + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching task" + + raised = False + try: + search_task_obj = 
ml_client.search_task(input_json=15) + assert search_task_obj == "Invalid JSON object passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching task" + raised = False + try: + search_task_obj = ml_client.search_task(input_json="15") + assert search_task_obj == "Invalid JSON object passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching task" -test_integration_model_train_register_full_cycle() + raised = False + try: + search_task_obj = ml_client.search_task( + input_json='{"query": {"match_all": {}},size: 1}' + ) + assert search_task_obj == "Invalid JSON string passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching task" + + # Search model cases + raised = False + try: + search_model_obj = ml_client.search_model( + input_json='{"query": {"match_all": {}},"size": 1}' + ) + assert search_model_obj["hits"]["hits"] != [] + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching model" + + raised = False + try: + search_model_obj = ml_client.search_model( + input_json={"query": {"match_all": {}}, "size": 1} + ) + assert search_model_obj["hits"]["hits"] != [] + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching model" + + raised = False + try: + search_model_obj = ml_client.search_model(input_json=15) + assert search_model_obj == "Invalid JSON object passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching model" + + raised = False + try: + search_model_obj = ml_client.search_model(input_json="15") + assert search_model_obj == "Invalid JSON object passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching model" + + raised = False + try: + search_model_obj = ml_client.search_model( + input_json='{"query": {"match_all": {}},size: 1}' + ) + assert search_model_obj == "Invalid JSON string passed as argument." + except: # noqa: E722 + raised = True + assert raised == False, "Raised Exception in searching model" diff --git a/tests/ml_commons/test_model_access_control.py b/tests/ml_commons/test_model_access_control.py new file mode 100644 index 000000000..8666baacb --- /dev/null +++ b/tests/ml_commons/test_model_access_control.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +import os +import time +from unittest.mock import patch + +import pytest +from opensearchpy.exceptions import NotFoundError, RequestError +from packaging.version import parse as parse_version + +from opensearch_py_ml.ml_commons.model_access_control import ModelAccessControl +from tests import OPENSEARCH_TEST_CLIENT + +OPENSEARCH_VERSION = parse_version(os.environ.get("OPENSEARCH_VERSION", "2.11.0")) + +# MAC = Model Access Control +# Minimum opensearch version that supports Model Access Control. 
+MAC_MIN_VERSION = parse_version("2.8.0") + +# Minimum Opensearch version that supports Model group updates +MAC_UPDATE_MIN_VERSION = parse_version("2.11.0") + + +@pytest.fixture +def client(): + return ModelAccessControl(OPENSEARCH_TEST_CLIENT) + + +def _safe_delete_model_group(client, model_group_name): + try: + client.delete_model_group_by_name(model_group_name=model_group_name) + except NotFoundError: + pass + + +@pytest.fixture +def test_model_group(client): + model_group_name = "__test__model_group_1" + _safe_delete_model_group(client, model_group_name) + time.sleep(2) + client.register_model_group( + name=model_group_name, + description="test model group for opensearch-py-ml test cases", + ) + yield model_group_name + + _safe_delete_model_group(client, model_group_name) + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_register_model_group(client): + model_group_name1 = "__test__model_group_A" + try: + _safe_delete_model_group(client, model_group_name1) + time.sleep(2) + res = client.register_model_group(name=model_group_name1) + assert isinstance(res, dict) + assert "model_group_id" in res + assert "status" in res + assert res["status"] == "CREATED" + except Exception as ex: + assert False, f"Failed to register model group due to {ex}" + + model_group_name2 = "__test__model_group_B" + + try: + _safe_delete_model_group(client, model_group_name2) + time.sleep(2) + res = client.register_model_group( + name=model_group_name2, + description="test", + access_mode="restricted", + backend_roles=["admin"], + ) + assert "model_group_id" in res + assert "status" in res + assert res["status"] == "CREATED" + except Exception as ex: + assert False, f"Failed to register restricted model group due to {ex}" + + model_group_name3 = "__test__model_group_C" + with pytest.raises(RequestError) as exec_info: + _safe_delete_model_group(client, model_group_name3) + time.sleep(2) + res = client.register_model_group( + name=model_group_name3, + description="test", + access_mode="restricted", + add_all_backend_roles=True, + ) + assert exec_info.value.status_code == 400 + assert exec_info.match("Admin users cannot add all backend roles to a model group") + + with pytest.raises(RequestError) as exec_info: + client.register_model_group(name=model_group_name2) + assert exec_info.value.status_code == 400 + assert exec_info.match( + "The name you provided is already being used by a model group" + ) + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_get_model_group_id_by_name(client, test_model_group): + model_group_id = client.get_model_group_id_by_name(test_model_group) + assert model_group_id is not None + + model_group_id = client.get_model_group_id_by_name("test-unknown") + assert model_group_id is None + + # Mock NotFoundError as it only happens when index isn't created + with patch.object(client, "search_model_group_by_name", side_effect=NotFoundError): + model_group_id = client.get_model_group_id_by_name(test_model_group) + assert model_group_id is None + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_UPDATE_MIN_VERSION, + reason="Model groups updates are supported in OpenSearch 2.11.0 and above", +) +def test_update_model_group(client, test_model_group): + # update model group name and description + update_query = { + "description": "updated description", + } + try: + model_group_id = 
client.get_model_group_id_by_name(test_model_group) + if model_group_id is None: + raise Exception(f"No model group found with the name: {test_model_group}") + res = client.update_model_group(update_query, model_group_id=model_group_id) + assert isinstance(res, dict) + assert "status" in res + assert res["status"] == "Updated" + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_search_model_group(client, test_model_group): + query1 = {"query": {"match": {"name": test_model_group}}, "size": 1} + try: + res = client.search_model_group(query1) + assert isinstance(res, dict) + assert "hits" in res and "hits" in res["hits"] + assert len(res["hits"]["hits"]) == 1 + assert "_source" in res["hits"]["hits"][0] + assert "name" in res["hits"]["hits"][0]["_source"] + assert test_model_group == res["hits"]["hits"][0]["_source"]["name"] + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + query2 = {"query": {"match": {"name": "test-unknown"}}, "size": 1} + try: + res = client.search_model_group(query2) + assert isinstance(res, dict) + assert "hits" in res and "hits" in res["hits"] + assert len(res["hits"]["hits"]) == 0 + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_search_model_group_by_name(client, test_model_group): + try: + res = client.search_model_group_by_name(model_group_name=test_model_group) + assert isinstance(res, dict) + assert "hits" in res and "hits" in res["hits"] + assert len(res["hits"]["hits"]) == 1 + assert "_source" in res["hits"]["hits"][0] + assert len(res["hits"]["hits"][0]["_source"]) > 1 + assert "name" in res["hits"]["hits"][0]["_source"] + assert test_model_group == res["hits"]["hits"][0]["_source"]["name"] + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + try: + res = client.search_model_group_by_name( + model_group_name=test_model_group, _source="name" + ) + assert isinstance(res, dict) + assert "hits" in res and "hits" in res["hits"] + assert len(res["hits"]["hits"]) == 1 + assert "_source" in res["hits"]["hits"][0] + assert len(res["hits"]["hits"][0]["_source"]) == 1 + assert "name" in res["hits"]["hits"][0]["_source"] + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + try: + res = client.search_model_group_by_name(model_group_name="test-unknown") + assert isinstance(res, dict) + assert "hits" in res and "hits" in res["hits"] + assert len(res["hits"]["hits"]) == 0 + except Exception as ex: + assert False, f"Failed to search model group due to unhandled error: {ex}" + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_delete_model_group(client, test_model_group): + # create a test model group + + for each in "AB": + model_group_name = f"__test__model_group_{each}" + model_group_id = client.get_model_group_id_by_name(model_group_name) + if model_group_id is None: + continue + res = client.delete_model_group(model_group_id=model_group_id) + assert res is None or isinstance(res, dict) + if isinstance(res, dict): + assert 
"result" in res + assert res["result"] in ["not_found", "deleted"] + + res = client.delete_model_group(model_group_id="test-unknown") + assert isinstance(res, dict) + assert "result" in res + assert res["result"] == "not_found" + + +@pytest.mark.skipif( + OPENSEARCH_VERSION < MAC_MIN_VERSION, + reason="Model groups are supported in OpenSearch 2.8.0 and above", +) +def test_delete_model_group_by_name(client): + with pytest.raises(NotFoundError): + client.delete_model_group_by_name(model_group_name="test-unknown") + + model_group = "__test__model_group_5" + client.register_model_group(name=model_group) + time.sleep(2) + res = client.delete_model_group_by_name(model_group_name=model_group) + assert isinstance(res, dict) + assert "result" in res + assert res["result"] == "deleted" diff --git a/tests/ml_commons/test_validators/test_model_access_control_validators.py b/tests/ml_commons/test_validators/test_model_access_control_validators.py new file mode 100644 index 000000000..d5701e702 --- /dev/null +++ b/tests/ml_commons/test_validators/test_model_access_control_validators.py @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +import pytest + +from opensearch_py_ml.ml_commons.validators.model_access_control import ( + _validate_model_group_access_mode, + _validate_model_group_add_all_backend_roles, + _validate_model_group_backend_roles, + _validate_model_group_description, + _validate_model_group_name, + _validate_model_group_query, + validate_create_model_group_parameters, + validate_delete_model_group_parameters, + validate_search_model_group_parameters, + validate_update_model_group_parameters, +) + + +def test_validate_model_group_name(): + with pytest.raises(ValueError): + _validate_model_group_name(None) + + with pytest.raises(ValueError): + _validate_model_group_name("") + + with pytest.raises(ValueError): + _validate_model_group_name(123) + + res = _validate_model_group_name("ValidName") + assert res is None + + +def test_validate_model_group_description(): + with pytest.raises(ValueError): + _validate_model_group_description(123) + + res = _validate_model_group_description("") + assert res is None + + res = _validate_model_group_description(None) + assert res is None + + res = _validate_model_group_description("ValidName") + assert res is None + + +def test_validate_model_group_access_mode(): + with pytest.raises(ValueError): + _validate_model_group_access_mode(123) + + res = _validate_model_group_access_mode("private") + assert res is None + + res = _validate_model_group_access_mode("restricted") + assert res is None + + res = _validate_model_group_access_mode(None) + assert res is None + + +def test_validate_model_group_backend_roles(): + with pytest.raises(ValueError): + _validate_model_group_backend_roles(123) + + res = _validate_model_group_backend_roles(["admin"]) + assert res is None + + res = _validate_model_group_backend_roles(None) + assert res is None + + +def test_validate_model_group_add_all_backend_roles(): + with pytest.raises(ValueError): + _validate_model_group_add_all_backend_roles(123) + + res = _validate_model_group_add_all_backend_roles(False) + assert res is None + + res = _validate_model_group_add_all_backend_roles(True) + assert res is None + + res = _validate_model_group_add_all_backend_roles(None) + assert 
res is None + + +def test_validate_model_group_query(): + with pytest.raises(ValueError): + _validate_model_group_query(123) + + res = _validate_model_group_query({}) + assert res is None + + with pytest.raises(ValueError): + _validate_model_group_query(None) + + res = _validate_model_group_query({"query": {"match": {"name": "test"}}}) + assert res is None + + with pytest.raises(ValueError): + _validate_model_group_query({}, 123) + + +def test_validate_create_model_group_parameters(): + with pytest.raises(ValueError): + validate_create_model_group_parameters(123) + + res = validate_create_model_group_parameters("test") + assert res is None + + with pytest.raises(ValueError): + validate_create_model_group_parameters("test", access_mode="restricted") + + with pytest.raises(ValueError): + validate_create_model_group_parameters( + "test", access_mode="private", add_all_backend_roles=True + ) + + +def test_validate_update_model_group_parameters(): + with pytest.raises(ValueError): + validate_update_model_group_parameters(123, 123) + + with pytest.raises(ValueError): + validate_update_model_group_parameters(123, "id") + + res = validate_update_model_group_parameters({"query": {}}, "test") + assert res is None + + +def test_validate_delete_model_group_parameters(): + with pytest.raises(ValueError): + validate_delete_model_group_parameters(123) + + res = validate_delete_model_group_parameters("test") + assert res is None + + +def test_validate_search_model_group_parameters(): + with pytest.raises(ValueError): + validate_search_model_group_parameters(123) + + res = validate_search_model_group_parameters({"query": {}}) + assert res is None diff --git a/tests/ml_models/test_question_answering.py b/tests/ml_models/test_question_answering.py new file mode 100644 index 000000000..3399b292e --- /dev/null +++ b/tests/ml_models/test_question_answering.py @@ -0,0 +1,73 @@ + +from opensearch_py_ml.ml_models import QuestionAnsweringModel + +# Save our model as pt or onnx +model_id = "distilbert-base-cased-distilled-squad" +folder_path = "question-model-folder" +our_pre_trained_model = QuestionAnsweringModel(model_id=model_id, folder_path=folder_path, overwrite=True) +# zip_file_path = our_pre_trained_model.save_as_pt(model_id=model_id, sentences=["for example providing a small sentence", "we can add multiple sentences"]) +zip_file_path = our_pre_trained_model.save_as_onnx(model_id=model_id) + +# List of questions to test +questions = ["Who was Jim Henson?", "Where do I live?", "What's my name?"] +contexts = ["Jim Henson was a nice puppet", "My name is Sarah and I live in London", "My name is Clara and I live in Berkeley."] + +# Obtain pytorch's official model +from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering +import torch +tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad') +official_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad') + +def official_model_answer(question, context): + inputs = tokenizer(question, context, return_tensors="pt") + with torch.no_grad(): + outputs = official_model(**inputs) + answer_start_index = torch.argmax(outputs.start_logits, dim=-1).item() + answer_end_index = torch.argmax(outputs.end_logits, dim=-1).item() + predict_answer_tokens = inputs['input_ids'][0, answer_start_index : answer_end_index + 1] + official_answer = tokenizer.decode(predict_answer_tokens) + return official_answer + +def test_onnx(): + from transformers import AutoTokenizer + from onnxruntime import 
InferenceSession + import numpy as np + session = InferenceSession(f"{folder_path}/{model_id}.onnx") + + for i in range(len(questions)): + question = questions[i] + context = contexts[i] + inputs = tokenizer(question, context, return_tensors="pt") + print(f"=== test {i}, question: {question}, context: {context}") + + inputs = tokenizer(question, context, return_tensors="np") + outputs = session.run(output_names=["start_logits", "end_logits"], input_feed=dict(inputs)) + + answer_start_index = np.argmax(outputs[0], axis=-1).item() + answer_end_index = np.argmax(outputs[1], axis=-1).item() + predict_answer_tokens = inputs['input_ids'][0, answer_start_index : answer_end_index + 1] + answer = tokenizer.decode(predict_answer_tokens) + + print(f" Official answer: {official_model_answer(question, context)}") + print(f" Our answer: {answer}") + +def test_pt(): + traced_model = torch.jit.load(f"{folder_path}/{model_id}.pt") + + for i in range(len(questions)): + question = questions[i] + context = contexts[i] + inputs = tokenizer(question, context, return_tensors="pt") + print(f"=== test {i}, question: {question}, context: {context}") + + with torch.no_grad(): + outputs = traced_model(**inputs) + answer_start_index = torch.argmax(outputs["start_logits"], dim=-1).item() + answer_end_index = torch.argmax(outputs["end_logits"], dim=-1).item() + predict_answer_tokens = inputs['input_ids'][0, answer_start_index : answer_end_index + 1] + answer = tokenizer.decode(predict_answer_tokens) + + print(f" Official answer: {official_model_answer(question, context)}") + print(f" Our answer: {answer}") + +test_onnx() \ No newline at end of file diff --git a/tests/ml_models/test_question_answering_pytest.py b/tests/ml_models/test_question_answering_pytest.py new file mode 100644 index 000000000..f33c90196 --- /dev/null +++ b/tests/ml_models/test_question_answering_pytest.py @@ -0,0 +1,254 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. 
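+
+# Traces distilbert-base-cased-distilled-squad with QuestionAnsweringModel's
+# save_as_pt / save_as_onnx and checks that the predicted answer-span indices
+# match the upstream Hugging Face model.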
+# How to run: pytest tests/ml_models/test_question_answering_pytest.py + +import json +import os +import shutil +from zipfile import ZipFile + +import pytest + +from opensearch_py_ml.ml_models import QuestionAnsweringModel + +TEST_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "test_model_files" +) +TESTDATA_FILENAME = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip.zip" +) +TESTDATA_UNZIP_FOLDER = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests", "sample_zip" +) + +default_model_id = "distilbert-base-cased-distilled-squad" + +def clean_test_folder(TEST_FOLDER): + if os.path.exists(TEST_FOLDER): + for files in os.listdir(TEST_FOLDER): + sub_path = os.path.join(TEST_FOLDER, files) + if os.path.isfile(sub_path): + os.remove(sub_path) + else: + try: + shutil.rmtree(sub_path) + except OSError as err: + print( + "Fail to delete files, please delete all files in " + + str(TEST_FOLDER) + + " " + + str(err) + ) + + shutil.rmtree(TEST_FOLDER) + + +def compare_model_config( + model_config_path, + model_id, + model_format, + expected_model_description=None, + expected_model_config_data=None, +): + try: + with open(model_config_path) as json_file: + model_config_data = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in {model_format} raised an exception {exec}" + + assert ( + "name" in model_config_data and model_config_data["name"] == model_id + ), f"Missing or Wrong model name in {model_format} model config file" + + assert ( + "model_format" in model_config_data + and model_config_data["model_format"] == model_format + ), f"Missing or Wrong model_format in {model_format} model config file" + + if expected_model_description is not None: + assert ( + "description" in model_config_data + and model_config_data["description"] == expected_model_description + ), f"Missing or Wrong model description in {model_format} model config file'" + + if expected_model_config_data is not None: + assert ( + "model_config" in model_config_data + ), f"Missing 'model_config' in {model_format} model config file" + + if expected_model_config_data is not None: + for k, v in expected_model_config_data.items(): + assert ( + k in model_config_data["model_config"] + and model_config_data["model_config"][k] == v + ) or ( + k not in model_config_data["model_config"] + and k == "normalize_result" + and not v + ) + + assert ( + "model_content_size_in_bytes" in model_config_data + ), f"Missing 'model_content_size_in_bytes' in {model_format} model config file" + + assert ( + "model_content_hash_value" in model_config_data + ), f"Missing 'model_content_hash_value' in {model_format} model config file" + + +def compare_model_zip_file(zip_file_path, expected_filenames, model_format): + with ZipFile(zip_file_path, "r") as f: + filenames = set(f.namelist()) + assert ( + filenames == expected_filenames + ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}" + + + + + +def test_check_attribute(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER) + try: + check_attribute = getattr(test_model, "model_id", "folder_path") + except AttributeError: + check_attribute = False + assert check_attribute + + assert test_model.folder_path == TEST_FOLDER + assert test_model.model_id == default_model_id + + default_folder = os.path.join(os.getcwd(), "question_answering_model_files") + + 
clean_test_folder(default_folder) + test_model0 = QuestionAnsweringModel() + assert test_model0.folder_path == default_folder + clean_test_folder(default_folder) + + clean_test_folder(TEST_FOLDER) + our_model_id = "distilbert-base-cased-distilled-squad" + test_model1 = QuestionAnsweringModel( + folder_path=TEST_FOLDER, model_id=our_model_id + ) + assert test_model1.model_id == our_model_id + + +def test_folder_path(): + with pytest.raises(Exception) as exc_info: + test_non_empty_path = os.path.join( + os.path.dirname(os.path.abspath("__file__")), "tests" + ) + QuestionAnsweringModel(folder_path=test_non_empty_path, overwrite=False) + assert exc_info.type is Exception + assert "The default folder path already exists" in exc_info.value.args[0] + + +# New tests for save_as_pt and save_as_onnx + +test_cases = [ + { + "question": "Who was Jim Henson?", + "context": "Jim Henson was a nice puppet" + }, + { + "question": "Where do I live?", + "context": "My name is Sarah and I live in London" + }, + { + "question": "What's my name?", + "context": "My name is Clara and I live in Berkeley." + }, + { + "question": "Which name is also used to describe the Amazon rainforest in English?", + "context": "The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain 'Amazonas' in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species." 
+ } +] + +def get_official_answer(test_cases): + # Obtain pytorch's official model + from transformers import AutoTokenizer, AutoModelForQuestionAnswering + import torch + tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad') + official_model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad') + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + with torch.no_grad(): + outputs = official_model(**inputs) + answer_start_index = torch.argmax(outputs.start_logits, dim=-1).item() + answer_end_index = torch.argmax(outputs.end_logits, dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + +def get_pt_answer(test_cases, folder_path, model_id): + from transformers import AutoTokenizer, AutoModelForQuestionAnswering + import torch + tokenizer = AutoTokenizer.from_pretrained(model_id) + traced_model = torch.jit.load(f"{folder_path}/{model_id}.pt") + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + with torch.no_grad(): + outputs = traced_model(**inputs) + answer_start_index = torch.argmax(outputs["start_logits"], dim=-1).item() + answer_end_index = torch.argmax(outputs["end_logits"], dim=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def get_onnx_answer(test_cases, folder_path, model_id): + from transformers import AutoTokenizer + from onnxruntime import InferenceSession + import numpy as np + session = InferenceSession(f"{folder_path}/{model_id}.onnx") + tokenizer = AutoTokenizer.from_pretrained(model_id) + + results = [] + + for case in test_cases: + question, context = case["question"], case["context"] + inputs = tokenizer(question, context, return_tensors="pt") + + inputs = tokenizer(question, context, return_tensors="np") + outputs = session.run(output_names=["start_logits", "end_logits"], input_feed=dict(inputs)) + + answer_start_index = np.argmax(outputs[0], axis=-1).item() + answer_end_index = np.argmax(outputs[1], axis=-1).item() + results.append([answer_start_index, answer_end_index]) + + return results + + +def test_pt_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_pt(default_model_id) + pt_results = get_pt_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(pt_results)): + assert pt_results[i] == official_results[i], f"Failed at index {i}: pt_results[{i}] ({pt_results[i]}) != official_results[{i}] ({official_results[i]})" + +def test_onnx_answer(): + test_model = QuestionAnsweringModel(folder_path=TEST_FOLDER, overwrite=True) + test_model.save_as_onnx(default_model_id) + onnx_results = get_onnx_answer(test_cases, TEST_FOLDER, default_model_id) + official_results = get_official_answer(test_cases) + for i in range(len(onnx_results)): + assert onnx_results[i] == official_results[i], f"Failed at index {i}: onnx_results[{i}] ({onnx_results[i]}) != official_results[{i}] ({official_results[i]})" + +clean_test_folder(TEST_FOLDER) +clean_test_folder(TESTDATA_UNZIP_FOLDER) \ No newline at end of file diff --git a/utils/model_uploader/model_listing/pretrained_models_all_versions.json b/utils/model_uploader/model_listing/pretrained_models_all_versions.json index adda4c17b..6a25040ad 100644 --- 
a/utils/model_uploader/model_listing/pretrained_models_all_versions.json +++ b/utils/model_uploader/model_listing/pretrained_models_all_versions.json @@ -113,6 +113,18 @@ } } }, + { + "name": "huggingface/sentence-transformers/paraphrase-mpnet-base-v2", + "versions": { + "1.0.0": { + "format": [ + "onnx", + "torch_script" + ], + "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search." + } + } + }, { "name": "huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "versions": { diff --git a/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md b/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md index 5496a9986..a6105fedf 100644 --- a/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md +++ b/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md @@ -19,3 +19,5 @@ The following table shows sentence transformer model upload history. |2023-08-31 15:22:19|@dhrubo-os|`sentence-transformers/msmarco-distilbert-base-tas-b`|1.0.2|ONNX|N/A|N/A|6042401385| |2023-08-31 15:22:19|@dhrubo-os|`sentence-transformers/msmarco-distilbert-base-tas-b`|1.0.2|TORCH_SCRIPT|N/A|N/A|6042401385| |2023-09-13 18:03:32|@dhrubo-os|`sentence-transformers/distiluse-base-multilingual-cased-v1`|1.0.1|TORCH_SCRIPT|N/A|N/A|6178024517| +|2023-10-18 18:06:15|@dhrubo-os|`sentence-transformers/paraphrase-mpnet-base-v2`|1.0.0|ONNX|N/A|N/A|6568285400| +|2023-10-18 18:06:15|@dhrubo-os|`sentence-transformers/paraphrase-mpnet-base-v2`|1.0.0|TORCH_SCRIPT|N/A|N/A|6568285400| diff --git a/utils/model_uploader/upload_history/supported_models.json b/utils/model_uploader/upload_history/supported_models.json index 916967614..ce09ec4c0 100644 --- a/utils/model_uploader/upload_history/supported_models.json +++ b/utils/model_uploader/upload_history/supported_models.json @@ -28,5 +28,25 @@ "Embedding Dimension": "N/A", "Pooling Mode": "N/A", "Workflow Run ID": "6178024517" + }, + { + "Model Uploader": "@dhrubo-os", + "Upload Time": "2023-10-18 18:06:15", + "Model ID": "sentence-transformers/paraphrase-mpnet-base-v2", + "Model Version": "1.0.0", + "Model Format": "ONNX", + "Embedding Dimension": "N/A", + "Pooling Mode": "N/A", + "Workflow Run ID": "6568285400" + }, + { + "Model Uploader": "@dhrubo-os", + "Upload Time": "2023-10-18 18:06:15", + "Model ID": "sentence-transformers/paraphrase-mpnet-base-v2", + "Model Version": "1.0.0", + "Model Format": "TORCH_SCRIPT", + "Embedding Dimension": "N/A", + "Pooling Mode": "N/A", + "Workflow Run ID": "6568285400" } ] \ No newline at end of file