import torch
import transformers.pipelines
from typing import Optional
from pathlib import Path
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol


class LoadLocalModel:
    """
    Creates transformers pipelines from models and tokenizers that are stored
    on the local file system, and remembers the key of the model that was
    loaded last.

    :_pipeline_factory: a callable used to create the transformers pipeline
    :task_name: name of the task the pipelines are created for
    :device: device handed to the pipeline factory
    :base_model_factory: a ModelFactoryProtocol used to load the model
    :tokenizer_factory: a ModelFactoryProtocol used to load the tokenizer
    """
    def __init__(self,
                 _pipeline_factory,
                 task_name: str,
                 device: str,
                 base_model_factory: ModelFactoryProtocol,
                 tokenizer_factory: ModelFactoryProtocol
                 ):
        # Nothing has been loaded yet, so there is no key to report.
        self._loaded_model_key = None
        self._tokenizer_factory = tokenizer_factory
        self._base_model_factory = base_model_factory
        self.device = device
        self.task_name = task_name
        self.pipeline_factory = _pipeline_factory

    @property
    def loaded_model_key(self):
        """Key passed to the last successful load_models call, None before the first one."""
        return self._loaded_model_key

    def load_models(self,
                    model_path: Path,
                    current_model_key: str
                    ) -> transformers.pipelines.Pipeline:
        """
        Loads the model and the tokenizer saved under model_path and returns
        a new pipeline built from them for the configured task and device.
        The path is used as given; no sub-directories are appended to it.

        :model_path: location of the saved model and tokenizer
        :current_model_key: key of the model to be loaded, stored as
            loaded_model_key once the pipeline has been created
        """
        source_dir = str(model_path)
        # Dict literals evaluate in order: the model is loaded before the tokenizer.
        pipeline_kwargs = {
            "model": self._base_model_factory.from_pretrained(source_dir),
            "tokenizer": self._tokenizer_factory.from_pretrained(source_dir),
            "device": self.device,
            "framework": "pt",
        }
        created_pipeline = self.pipeline_factory(self.task_name, **pipeline_kwargs)

        # Only record the key after the pipeline was created without an error.
        self._loaded_model_key = current_model_key
        return created_pipeline

    def clear_device_memory(self):
        """
        Asks torch to release its cached CUDA memory.

        This object keeps no reference to the loaded model or to the returned
        pipeline, so nothing is deleted here; loaded_model_key is left as is.
        """
        torch.cuda.empty_cache()
from pathlib import Path, PurePosixPath
from transformers import AutoModel, AutoTokenizer, Pipeline, pipeline
import tarfile

from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \
    HuggingFaceHubBucketFSModelTransferSPFactory
from exasol_bucketfs_utils_python.localfs_mock_bucketfs_location import \
    LocalFSMockBucketFSLocation

from tests.utils.parameters import model_params

import tempfile


class TestSetup:
    """
    Shared fixture for the tests below: real Huggingface factories, the tiny
    test model name and a LoadLocalModel wired to the real pipeline function.
    """
    # The name matches pytest's "Test*" class pattern but this is a helper with
    # an __init__; opting out of collection avoids a PytestCollectionWarning.
    __test__ = False

    def __init__(self):

        self.base_model_factory: ModelFactoryProtocol = AutoModel
        self.tokenizer_factory: ModelFactoryProtocol = AutoTokenizer

        self.token = "token"
        self.model_name = model_params.tiny_model

        # load_models declares the key as str, so use a real value instead of
        # None; this also makes the loaded_model_key assertions meaningful.
        self.mock_current_model_key = "some_key"
        self.loader = LoadLocalModel(
            pipeline,
            task_name="token-classification",
            device="cpu",
            base_model_factory=self.base_model_factory,
            tokenizer_factory=self.tokenizer_factory
        )


def download_model_with_huggingface_transfer(test_setup, mock_bucketfs_location):
    """
    Downloads model and tokenizer with the Huggingface transfer class, uploads
    them to the given mock BucketFS location and unpacks the uploaded archive.

    :test_setup: TestSetup providing model name and factories
    :mock_bucketfs_location: local mock BucketFS location to upload to
    :return: directory the uploaded archive was extracted into
    """
    model_transfer_factory = HuggingFaceHubBucketFSModelTransferSPFactory()
    downloader = model_transfer_factory.create(bucketfs_location=mock_bucketfs_location,
                                               model_name=test_setup.model_name,
                                               model_path=Path("cached_files"),
                                               token="")
    downloader.download_from_huggingface_hub(test_setup.base_model_factory)
    downloader.download_from_huggingface_hub(test_setup.tokenizer_factory)
    bucketfs_model_path = downloader.upload_to_bucketfs()

    archive_path = mock_bucketfs_location.base_path / bucketfs_model_path
    extract_dir = archive_path.parent
    # The archive was created by this test run itself, so its content is trusted.
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=extract_dir)
    return extract_dir


def test_load_local_model():
    """A model saved with save_pretrained can be loaded into a pipeline."""
    test_setup = TestSetup()

    with tempfile.TemporaryDirectory() as tmp_dir:
        model_save_path = Path(tmp_dir) / "pretrained" / test_setup.model_name
        # download a model
        model = AutoModel.from_pretrained(test_setup.model_name)
        tokenizer = AutoTokenizer.from_pretrained(test_setup.model_name)
        model.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)

        loaded_pipeline = test_setup.loader.load_models(
            current_model_key=test_setup.mock_current_model_key,
            model_path=model_save_path)

        assert isinstance(loaded_pipeline, Pipeline)
        assert test_setup.loader.loaded_model_key == test_setup.mock_current_model_key


def test_load_local_model_with_huggingface_model_transfer():
    """A model stored through the Huggingface transfer class can be loaded into a pipeline."""
    test_setup = TestSetup()

    with tempfile.TemporaryDirectory() as tmp_dir:
        mock_bucketfs_location = LocalFSMockBucketFSLocation(
            PurePosixPath(Path(tmp_dir) / "bucket"))

        # download a model
        downloaded_model_path = download_model_with_huggingface_transfer(
            test_setup, mock_bucketfs_location)

        loaded_pipeline = test_setup.loader.load_models(
            current_model_key=test_setup.mock_current_model_key,
            model_path=downloaded_model_path)

        assert isinstance(loaded_pipeline, Pipeline)
        assert test_setup.loader.loaded_model_key == test_setup.mock_current_model_key
import tempfile
from pathlib import Path
from typing import Union
from unittest.mock import create_autospec, MagicMock, call, Mock

import transformers

from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
from tests.unit_tests.udf_wrapper_params.zero_shot.mock_zero_shot import MockPipeline
from tests.utils.mock_cast import mock_cast


class TestSetup:
    """
    Shared fixture for the test below: autospec'd model and tokenizer
    factories, a mocked pipeline factory and a LoadLocalModel built from them.
    """
    # The name matches pytest's "Test*" class pattern but this is a helper with
    # an __init__; opting out of collection avoids a PytestCollectionWarning.
    __test__ = False

    def __init__(self):

        self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol)
        self.tokenizer_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol)
        self.token = "token"
        self.model_name = "model_name"
        self.mock_current_model_key = "some_key"
        self.cache_dir = "test/Path"

        self.mock_pipeline = Mock()
        self.loader = LoadLocalModel(
            self.mock_pipeline,
            task_name="test_task",
            device="cpu",
            base_model_factory=self.model_factory_mock,
            tokenizer_factory=self.tokenizer_factory_mock)


def test_load_function_call():
    """
    load_models calls both factories with the model path as string, forwards
    their results to the pipeline factory, returns the created pipeline and
    records the model key.
    """
    test_setup = TestSetup()
    model_save_path = Path(test_setup.cache_dir) / "pretrained" / test_setup.model_name

    # No model has been loaded yet.
    assert test_setup.loader.loaded_model_key is None

    created_pipeline = test_setup.loader.load_models(
        current_model_key=test_setup.mock_current_model_key,
        model_path=model_save_path)

    assert test_setup.model_factory_mock.mock_calls == [
        call.from_pretrained(str(model_save_path))]
    assert test_setup.tokenizer_factory_mock.mock_calls == [
        call.from_pretrained(str(model_save_path))]
    assert test_setup.mock_pipeline.mock_calls == [
        call('test_task',
             model=mock_cast(test_setup.model_factory_mock.from_pretrained).return_value,
             tokenizer=mock_cast(test_setup.tokenizer_factory_mock.from_pretrained).return_value,
             device='cpu', framework='pt')]
    # The pipeline built by the factory is handed back unchanged ...
    assert created_pipeline is test_setup.mock_pipeline.return_value
    # ... and the key of the loaded model is remembered.
    assert test_setup.loader.loaded_model_key == test_setup.mock_current_model_key