diff --git a/doc/changes/changelog.md b/doc/changes/changelog.md index 4bbfdc25..ee00ce1c 100644 --- a/doc/changes/changelog.md +++ b/doc/changes/changelog.md @@ -1,5 +1,6 @@ # Changelog +* [1.0.0](changes_1.0.0.md) * [0.10.0](changes_0.10.0.md) * [0.9.2](changes_0.9.2.md) * [0.9.1](changes_0.9.1.md) diff --git a/doc/changes/changes_0.10.0.md b/doc/changes/changes_0.10.0.md index 84120c42..fbb98fab 100644 --- a/doc/changes/changes_0.10.0.md +++ b/doc/changes/changes_0.10.0.md @@ -20,4 +20,3 @@ Deploying SLC under Windows, releasing to PyPi. ### Security - diff --git a/doc/changes/changes_1.0.0.md b/doc/changes/changes_1.0.0.md new file mode 100644 index 00000000..afd0622e --- /dev/null +++ b/doc/changes/changes_1.0.0.md @@ -0,0 +1,17 @@ +# Transformers Extension 1.0.0, T.B.D + +Code name: T.B.D + + +## Summary +T.B.D + + +### Features + +- #146: Integrated new download and load functions using save_pretrained + +### Refactorings + + +### Security diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index f34c6599..f53a1973 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -229,9 +229,10 @@ Before you can use pre-trained models, the models must be stored in the BucketFS. We provide two different ways to load transformers models into BucketFS: + ### 1. Model Downloader UDF Using the `TE_MODEL_DOWNLOADER_UDF` below, you can download the desired model -from the huggingface hub and upload it to bucketfs. +from the huggingface hub and upload it to BucketFS. ```sql SELECT TE_MODEL_DOWNLOADER_UDF( @@ -274,6 +275,19 @@ models from the local filesystem into BucketFS: ``` *Note*: The options --local-model-path needs to point to a path which contains the model and its tokenizer. +These should have been saved using transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline) +function to ensure proper loading by the Transformers Extension UDFs. 
+You can download the model using Python like this: + +```python + for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: + # download the model and tokenizer from huggingface + model = model_factory.from_pretrained(model_name, cache_dir=<cache_dir> / <model_name>) + # save the downloaded model using the save_pretrained function + model.save_pretrained(<save_path> / "pretrained" / <model_name>) +``` +Then upload it using the exasol_transformers_extension.upload_model script, where ```local-model-path = <save_path> / "pretrained" / <model_name>``` + ## Prediction UDFs We provided 7 prediction UDFs, each performing an NLP task through the [transformers API](https://huggingface.co/docs/transformers/task_summary). diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py index 343db5a0..4c782639 100644 --- a/exasol_transformers_extension/udfs/models/base_model_udf.py +++ b/exasol_transformers_extension/udfs/models/base_model_udf.py @@ -1,33 +1,44 @@ +import os from abc import abstractmethod, ABC from typing import Iterator, List, Any import torch import traceback import pandas as pd import numpy as np +import transformers + from exasol_transformers_extension.deployment import constants from exasol_transformers_extension.utils import device_management, \ bucketfs_operations, dataframe_operations -from exasol_transformers_extension.utils.load_model import LoadModel +from exasol_transformers_extension.utils.load_local_model import LoadLocalModel +from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol class BaseModelUDF(ABC): """ This base class should be extended by each UDF class containing model logic. - This class contains common operations for all prediction UDFs. 
The following - methods should be implemented specifically for each UDF class: + This class contains common operations for all prediction UDFs: - accesses data part-by-part based on predefined batch size - - manages the script cache + - manages the model cache - reads the corresponding model from BucketFS into cache - creates model pipeline through transformer api - manages the creation of predictions and the preparation of results. + + Additionally, the following + methods should be implemented specifically for each UDF class: + - create_dataframes_from_predictions + - extract_unique_param_based_dataframes + - execute_prediction + - append_predictions_to_input_dataframe + """ def __init__(self, exa, - batch_size, - pipeline, - base_model, - tokenizer, - task_name): + batch_size: int, + pipeline: transformers.Pipeline, + base_model: ModelFactoryProtocol, + tokenizer: ModelFactoryProtocol, + task_name: str): self.exa = exa self.batch_size = batch_size self.pipeline = pipeline @@ -59,11 +70,11 @@ def create_model_loader(self): """ Creates the model_loader. 
""" - self.model_loader = LoadModel(self.pipeline, - self.base_model, - self.tokenizer, - self.task_name, - self.device) + self.model_loader = LoadLocalModel(pipeline_factory=self.pipeline, + base_model_factory=self.base_model, + tokenizer_factory=self.tokenizer, + task_name=self.task_name, + device=self.device) def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame: """ @@ -180,17 +191,11 @@ def check_cache(self, model_df: pd.DataFrame) -> None: token_conn = model_df["token_conn"].iloc[0] current_model_key = (bucketfs_conn, sub_dir, model_name, token_conn) - if self.model_loader.last_loaded_model_key != current_model_key: + if self.model_loader.loaded_model_key != current_model_key: self.set_cache_dir(model_name, bucketfs_conn, sub_dir) self.model_loader.clear_device_memory() - if token_conn: - token_conn_obj = self.exa.get_connection(token_conn) - else: - token_conn_obj = None - self.last_created_pipeline = self.model_loader.load_models(model_name, - current_model_key, - self.cache_dir, - token_conn_obj) + self.last_created_pipeline = self.model_loader.load_models(self.cache_dir, + current_model_key) def set_cache_dir( self, model_name: str, bucketfs_conn_name: str, @@ -206,11 +211,10 @@ def set_cache_dir( bucketfs_operations.create_bucketfs_location_from_conn_object( self.exa.get_connection(bucketfs_conn_name)) - model_path = bucketfs_operations.get_model_path(sub_dir, model_name) + model_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name) self.cache_dir = bucketfs_operations.get_local_bucketfs_path( bucketfs_location=bucketfs_location, model_path=str(model_path)) - def get_prediction(self, model_df: pd.DataFrame) -> pd.DataFrame: """ Perform prediction of the given model and preparation of the prediction diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py index 63430a8f..49e4bea0 100644 --- 
a/exasol_transformers_extension/udfs/models/model_downloader_udf.py +++ b/exasol_transformers_extension/udfs/models/model_downloader_udf.py @@ -4,17 +4,29 @@ from exasol_bucketfs_utils_python.bucketfs_factory import BucketFSFactory from exasol_transformers_extension.utils import bucketfs_operations -from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol, \ - HuggingFaceHubBucketFSModelTransferFactory +from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol +from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ + HuggingFaceHubBucketFSModelTransferSPFactory class ModelDownloaderUDF: + """ + UDF which downloads a pretrained model from Huggingface using Huggingface's transformers API, + and uploads it to the BucketFS, from where it can then be loaded without accessing Huggingface again. + Must be called with the following input parameters: + + model_name | sub_dir | bfs_conn | token_conn + --------------------------------------------------------------------------------------------------- + name of Huggingface model | directory to save model | BucketFS connection | name of token connection + + returns <model_path>, <model_tar_file_path> + """ def __init__(self, exa, base_model_factory: ModelFactoryProtocol = transformers.AutoModel, tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer, - huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferFactory = - HuggingFaceHubBucketFSModelTransferFactory(), + huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory = + HuggingFaceHubBucketFSModelTransferSPFactory(), bucketfs_factory: BucketFSFactory = BucketFSFactory()): self._exa = exa self._base_model_factory = base_model_factory @@ -31,10 +43,10 @@ def run(self, ctx) -> None: def _download_model(self, ctx) -> Tuple[str, str]: # parameters - model_name = ctx.model_name - sub_dir = ctx.sub_dir - bfs_conn = ctx.bfs_conn - 
token_conn = ctx.token_conn + model_name = ctx.model_name # name of Huggingface model + sub_dir = ctx.sub_dir # directory to save model + bfs_conn = ctx.bfs_conn # BucketFS connection + token_conn = ctx.token_conn # name of token connection # extract token from the connection if token connection name is given. # note that, token is required for private models. It doesn't matter @@ -64,6 +76,7 @@ def _download_model(self, ctx) -> Tuple[str, str]: ) as downloader: for model in [self._base_model_factory, self._tokenizer_factory]: downloader.download_from_huggingface_hub(model) + # upload model files to BucketFS model_tar_file_path = downloader.upload_to_bucketfs() return str(model_path), str(model_tar_file_path) diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py index f6f6f27a..20207970 100644 --- a/exasol_transformers_extension/upload_model.py +++ b/exasol_transformers_extension/upload_model.py @@ -33,13 +33,18 @@ def main( model_name: str, sub_dir: str, local_model_path: str): + """ + Script for uploading locally saved model files to BucketFS. Files should have been saved locally + using Transformers save_pretrained function. 
This ensures proper loading from the BucketFS later + """ # create bucketfs location bucketfs_location = bucketfs_operations.create_bucketfs_location( bucketfs_name, bucketfs_host, bucketfs_port, bucketfs_use_https, bucketfs_user, bucketfs_password, bucket, path_in_bucket) # upload the downloaded model files into bucketfs - upload_path = bucketfs_operations.get_model_path(sub_dir, model_name) + upload_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name) + bucketfs_operations.upload_model_files_to_bucketfs( local_model_path, upload_path, bucketfs_location) diff --git a/exasol_transformers_extension/utils/bucketfs_model_uploader.py b/exasol_transformers_extension/utils/bucketfs_model_uploader.py index 1ce3d499..a5310975 100644 --- a/exasol_transformers_extension/utils/bucketfs_model_uploader.py +++ b/exasol_transformers_extension/utils/bucketfs_model_uploader.py @@ -1,5 +1,6 @@ from pathlib import Path +from exasol_bucketfs_utils_python.abstract_bucketfs_location import AbstractBucketFSLocation from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation from exasol_transformers_extension.utils import bucketfs_operations diff --git a/exasol_transformers_extension/utils/bucketfs_operations.py b/exasol_transformers_extension/utils/bucketfs_operations.py index f11402a5..1ad96a59 100644 --- a/exasol_transformers_extension/utils/bucketfs_operations.py +++ b/exasol_transformers_extension/utils/bucketfs_operations.py @@ -41,10 +41,13 @@ def create_bucketfs_location( def upload_model_files_to_bucketfs( tmpdir_name: str, model_path: Path, bucketfs_location: AbstractBucketFSLocation) -> Path: + """ + uploads model in tmpdir_name to model_path in bucketfs_location + """ with tempfile.TemporaryFile() as fileobj: create_tar_of_directory(Path(tmpdir_name), fileobj) - model_tar_file = model_path.with_suffix(".tar.gz") - return upload_file_to_bucketfs_with_retry(bucketfs_location, fileobj, model_tar_file) + model_upload_tar_file_path = 
model_path.with_suffix(".tar.gz") + return upload_file_to_bucketfs_with_retry(bucketfs_location, fileobj, model_upload_tar_file_path) @retry(wait=wait_fixed(2), stop=stop_after_attempt(10)) @@ -69,4 +72,8 @@ def get_local_bucketfs_path( def get_model_path(sub_dir: str, model_name: str) -> Path: - return Path(sub_dir, model_name.replace('-', '_')) + return Path(sub_dir, model_name) + + +def get_model_path_with_pretrained(sub_dir: str, model_name: str) -> Path: + return Path(sub_dir, model_name, "pretrained" , model_name) diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer.py deleted file mode 100644 index f09e17fc..00000000 --- a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer.py +++ /dev/null @@ -1,61 +0,0 @@ -from pathlib import Path - - -from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation - -from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol -from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploaderFactory -from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory - - -class HuggingFaceHubBucketFSModelTransfer: - - def __init__(self, - bucketfs_location: BucketFSLocation, - model_name: str, - model_path: Path, - token: str, - temporary_directory_factory: TemporaryDirectoryFactory = TemporaryDirectoryFactory(), - bucketfs_model_uploader_factory: BucketFSModelUploaderFactory = BucketFSModelUploaderFactory()): - self._token = token - self._model_name = model_name - self._temporary_directory_factory = temporary_directory_factory - self._bucketfs_model_uploader = bucketfs_model_uploader_factory.create( - model_path=model_path, - bucketfs_location=bucketfs_location) - self._tmpdir = temporary_directory_factory.create() - self._tmpdir_name = self._tmpdir.__enter__() - - def 
__enter__(self): - return self - - def __del__(self): - self._tmpdir.cleanup() - - def __exit__(self, exc_type, exc_val, exc_tb): - self._tmpdir.__exit__(exc_type, exc_val, exc_tb) - - def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol): - """ - Download a model from HuggingFace Hub into a temporary directory - """ - model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name, use_auth_token=self._token) - - def upload_to_bucketfs(self) -> Path: - """ - Upload the downloaded models into the BucketFS - """ - return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name) - - -class HuggingFaceHubBucketFSModelTransferFactory: - - def create(self, - bucketfs_location: BucketFSLocation, - model_name: str, - model_path: Path, - token: str) -> HuggingFaceHubBucketFSModelTransfer: - return HuggingFaceHubBucketFSModelTransfer(bucketfs_location=bucketfs_location, - model_name=model_name, - model_path=model_path, - token=token) diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py index db90a7eb..56c92738 100644 --- a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py +++ b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py @@ -1,5 +1,6 @@ from pathlib import Path +from exasol_bucketfs_utils_python.abstract_bucketfs_location import AbstractBucketFSLocation from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol @@ -7,18 +8,15 @@ from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory - - - class HuggingFaceHubBucketFSModelTransferSP: """ - Class for downloading a model using the Huggingface Transformers API, and loading it into the BucketFS - using save_pretrained. 
+ Class for downloading a model using the Huggingface Transformers API, saving it locally using + transformers save_pretrained, and loading the saved model files into the BucketFS. - :bucketfs_location: BucketFSLocation the model should be loaded to - :model_name: Name of the model to be downloaded using Huggingface Transformers API - :model_path: Path the model will be loaded into the BucketFS at - :token: Huggingface token, only needed for private models + :bucketfs_location: BucketFSLocation the model should be loaded to + :model_name: Name of the model to be downloaded using Huggingface Transformers API + :model_path: Path the model will be loaded into the BucketFS at + :token: Huggingface token, only needed for private models :temporary_directory_factory: Optional. Default is TemporaryDirectoryFactory. Mainly change for testing. :bucketfs_model_uploader_factory: Optional. Default is BucketFSModelUploaderFactory. Mainly change for testing. """ @@ -50,9 +48,10 @@ def __exit__(self, exc_type, exc_val, exc_tb): def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol): """ Download a model from HuggingFace Hub into a temporary directory and save it with save_pretrained - in temporary directory / pretrained . + in temporary directory / pretrained / model_name. 
""" - model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache", use_auth_token=self._token) + model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache", + use_auth_token=self._token) model.save_pretrained(self._tmpdir_name / "pretrained" / self._model_name) def upload_to_bucketfs(self) -> Path: @@ -61,7 +60,7 @@ def upload_to_bucketfs(self) -> Path: returns: Path of the uploaded model in the BucketFS """ - return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name) + return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name) #todo should we do replace(-,_) here to? class HuggingFaceHubBucketFSModelTransferSPFactory: diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py index c1e10a28..9a8b27ea 100644 --- a/exasol_transformers_extension/utils/load_local_model.py +++ b/exasol_transformers_extension/utils/load_local_model.py @@ -10,19 +10,19 @@ class LoadLocalModel: Class for loading locally saved models and tokenizers. Also stores information regarding the model and pipeline. 
:_pipeline_factory: a function to create a transformers pipeline - :task_name: name of the current task - :device: device to be used for pipeline creation + :task_name: name of the current task + :device: device to be used for pipeline creation, e.g. "CPU" :_base_model_factory: a ModelFactoryProtocol for creating the loaded model :_tokenizer_factory: a ModelFactoryProtocol for creating the loaded tokenizer """ def __init__(self, - _pipeline_factory, + pipeline_factory, task_name: str, device: str, base_model_factory: ModelFactoryProtocol, tokenizer_factory: ModelFactoryProtocol ): - self.pipeline_factory = _pipeline_factory + self.pipeline_factory = pipeline_factory self.task_name = task_name self.device = device self._base_model_factory = base_model_factory @@ -39,7 +39,7 @@ def load_models(self, current_model_key: str ) -> transformers.pipelines.Pipeline: """ - Loads a locally saved model and tokenizer from "cache_dir / "pretrained" / model_name". + Loads a locally saved model and tokenizer from model_path. Returns new pipeline corresponding to the model and task. :model_path: location of the saved model and tokenizer diff --git a/exasol_transformers_extension/utils/load_model.py b/exasol_transformers_extension/utils/load_model.py deleted file mode 100644 index 74c5b881..00000000 --- a/exasol_transformers_extension/utils/load_model.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch - -class LoadModel: - def __init__(self, - pipeline, - base_model, - tokenizer, - task_name, - device - ): - self.pipeline = pipeline - self.base_model = base_model - self.tokenizer = tokenizer - self.task_name = task_name - self.device = device - self.last_loaded_model = None - self.last_loaded_tokenizer = None - self.last_loaded_model_key = None - - def load_models(self, model_name: str, - current_model_key, - cache_dir, - token_conn_obj) -> None: - """ - Load model and tokenizer model from the cached location in bucketfs. 
- If the desired model is not cached, this method will attempt to - download the model to the read-only path /bucket/.. and cause an error. - This error will be addressed in ticket - https://github.com/exasol/transformers-extension/issues/43. - - :param model_name: The model name to be loaded - """ - token = False - if token_conn_obj: - token = token_conn_obj.password - - self.last_loaded_model = self.base_model.from_pretrained( - model_name, cache_dir=cache_dir, use_auth_token=token) - self.last_loaded_tokenizer = self.tokenizer.from_pretrained( - model_name, cache_dir=cache_dir, use_auth_token=token) - last_created_pipeline = self.pipeline( - self.task_name, - model=self.last_loaded_model, - tokenizer=self.last_loaded_tokenizer, - device=self.device, - framework="pt") - self.last_loaded_model_key = current_model_key - return last_created_pipeline - - def clear_device_memory(self): - """ - Delete models and free device memory - """ - self.last_loaded_model = None - self.last_loaded_tokenizer = None - torch.cuda.empty_cache() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7861507c..2929638a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "exasol-transformers-extension" -version = "0.10.0" +version = "1.0.0" description = "An Exasol extension to use state-of-the-art pretrained machine learning models via the transformers api." 
authors = [ diff --git a/tests/fixtures/model_fixture.py b/tests/fixtures/model_fixture.py index 1d3b6892..6a7f635a 100644 --- a/tests/fixtures/model_fixture.py +++ b/tests/fixtures/model_fixture.py @@ -11,15 +11,17 @@ AbstractBucketFSLocation -def download_model(model_name: str, tmpdir_name: Path) -> None: - for downloader in [transformers.AutoModel, transformers.AutoTokenizer]: - downloader.from_pretrained(model_name, cache_dir=tmpdir_name) +def download_model(model_name: str, tmpdir_name: Path) -> Path: + tmpdir_name = Path(tmpdir_name) + for model_factory in [transformers.AutoModel, transformers.AutoTokenizer]: + model = model_factory.from_pretrained(model_name, cache_dir=tmpdir_name / "cache" / model_name) + model.save_pretrained(tmpdir_name / "pretrained" / model_name) + return tmpdir_name / "pretrained" / model_name @contextmanager def upload_model(bucketfs_location: AbstractBucketFSLocation, model_name: str, model_dir: Path) -> Path: - model_path = bucketfs_operations.get_model_path( model_params.sub_dir, model_name) bucketfs_operations.upload_model_files_to_bucketfs( @@ -29,33 +31,29 @@ def upload_model(bucketfs_location: AbstractBucketFSLocation, yield model_path -@contextmanager -def upload_model_to_local_bucketfs( - model_name: str, download_tmpdir: Path) -> str: +def generate_local_bucketfs_path_for_model(tmpdir: Path, model: str): + return tmpdir / model_params.sub_dir / model - download_model(model_name, download_tmpdir) - upload_tmpdir_name = Path(download_tmpdir, "upload_tmpdir") - upload_tmpdir_name.mkdir(parents=True, exist_ok=True) - bucketfs_location = LocalFSMockBucketFSLocation( - PurePosixPath(upload_tmpdir_name)) - upload_model(bucketfs_location, model_name, download_tmpdir) - yield upload_tmpdir_name + +def prepare_model_for_local_bucketfs(model: str, tmpdir_factory): + tmpdir = tmpdir_factory.mktemp(model) + bucketfs_path_for_model = generate_local_bucketfs_path_for_model(tmpdir, model) + download_model(model, bucketfs_path_for_model) + 
return tmpdir @pytest.fixture(scope="session") -def upload_base_model_to_local_bucketfs(tmpdir_factory) -> PurePosixPath: - tmpdir = tmpdir_factory.mktemp(model_params.base_model) - with upload_model_to_local_bucketfs( - model_params.base_model, tmpdir) as path: - yield path +def prepare_base_model_for_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model = model_params.base_model + bucketfs_path = prepare_model_for_local_bucketfs(model, tmpdir_factory) + yield bucketfs_path @pytest.fixture(scope="session") -def upload_seq2seq_model_to_local_bucketfs(tmpdir_factory) -> PurePosixPath: - tmpdir = tmpdir_factory.mktemp(model_params.seq2seq_model) - with upload_model_to_local_bucketfs( - model_params.seq2seq_model, tmpdir) as path: - yield path +def prepare_seq2seq_model_in_local_bucketfs(tmpdir_factory) -> PurePosixPath: + model = model_params.seq2seq_model + bucketfs_path = prepare_model_for_local_bucketfs(model, tmpdir_factory) + yield bucketfs_path @contextmanager @@ -63,7 +61,6 @@ def upload_model_to_bucketfs( model_name: str, download_tmpdir: Path, bucketfs_location: AbstractBucketFSLocation) -> str: - download_model(model_name, download_tmpdir) with upload_model( bucketfs_location, model_name, download_tmpdir) as model_path: diff --git a/tests/fixtures/setup_database_fixture.py b/tests/fixtures/setup_database_fixture.py index 6b93db21..d21a77b3 100644 --- a/tests/fixtures/setup_database_fixture.py +++ b/tests/fixtures/setup_database_fixture.py @@ -8,6 +8,7 @@ from exasol_transformers_extension.deployment.scripts_deployer import \ ScriptsDeployer from tests.utils.parameters import bucketfs_params +from tests.fixtures.language_container_fixture import language_alias bucketfs_connection_name = "TEST_TE_BFS_CONNECTION" schema_name = "TEST_INTEGRATION" diff --git a/tests/integration_tests/with_db/deployment/test_language_container_deployer_cli.py b/tests/integration_tests/with_db/deployment/test_language_container_deployer_cli.py index c6b229a5..6547fb96 100644 
--- a/tests/integration_tests/with_db/deployment/test_language_container_deployer_cli.py +++ b/tests/integration_tests/with_db/deployment/test_language_container_deployer_cli.py @@ -125,7 +125,7 @@ def test_language_container_deployer_cli_by_downloading_container( schema = test_name language_alias = f"PYTHON3_TE_{test_name.upper()}" container_path = None - version = "0.5.0" + version = "0.9.0" create_schema(pyexasol_connection, schema) dsn = f"{exasol_config.host}:{exasol_config.port}" with revert_language_settings(pyexasol_connection): diff --git a/tests/integration_tests/with_db/test_upload_model.py b/tests/integration_tests/with_db/test_upload_model.py index 52c9a81e..9fa4ca30 100644 --- a/tests/integration_tests/with_db/test_upload_model.py +++ b/tests/integration_tests/with_db/test_upload_model.py @@ -12,14 +12,7 @@ from tests.integration_tests.with_db.udfs.python_rows_to_sql import python_rows_to_sql from tests.utils import postprocessing from tests.utils.parameters import bucketfs_params, model_params - - -@pytest.fixture(scope='function') -def download_sample_models(tmp_path) -> Path: - for downloader in [transformers.AutoModel, transformers.AutoTokenizer]: - downloader.from_pretrained(model_params.base_model, cache_dir=tmp_path) - - yield tmp_path, model_params.base_model +from tests.fixtures.model_fixture import download_model def adapt_file_to_upload(path: PosixPath, download_path: PosixPath): @@ -33,11 +26,12 @@ def adapt_file_to_upload(path: PosixPath, download_path: PosixPath): return PosixPath(path) -def test_model_upload(setup_database, pyexasol_connection, download_sample_models: Path, +def test_model_upload(setup_database, pyexasol_connection, tmp_path: Path, bucketfs_location: BucketFSLocation, bucketfs_config: config.BucketFs): sub_dir = 'sub_dir' - download_path, model_name = download_sample_models - upload_path = bucketfs_operations.get_model_path( + model_name = model_params.base_model + download_path = download_model(model_name, tmp_path) + 
upload_path = bucketfs_operations.get_model_path_with_pretrained( sub_dir, model_name) parsed_url = urlparse(bucketfs_config.url) host = parsed_url.netloc.split(":")[0] diff --git a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py index 3bbbb469..cbcd849c 100644 --- a/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py +++ b/tests/integration_tests/without_db/udfs/test_filling_mask_udf.py @@ -1,3 +1,4 @@ +import os from typing import Dict import pandas as pd @@ -10,6 +11,7 @@ from tests.integration_tests.without_db.udfs.matcher import Result, ScoreMatcher, RankDTypeMatcher, ShapeMatcher, \ NoErrorMessageMatcher, NewColumnsEmptyMatcher, ErrorMessageMatcher, RankMonotonicMatcher, ColumnsMatcher from tests.utils.parameters import model_params +from tests.fixtures.model_fixture import prepare_base_model_for_local_bucketfs class ExaEnvironment: @@ -52,12 +54,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf( - description, device_id, n_rows, upload_base_model_to_local_bucketfs): + description, device_id, n_rows, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -111,12 +113,12 @@ def test_filling_mask_udf( ("on GPU with single input", 0, 1) ]) def test_filling_mask_udf_on_error_handling( - description, device_id, n_rows, upload_base_model_to_local_bucketfs): + description, device_id, n_rows, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available 
device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py index 8fafd71f..2f0acdcf 100644 --- a/tests/integration_tests/without_db/udfs/test_question_answering_udf.py +++ b/tests/integration_tests/without_db/udfs/test_question_answering_udf.py @@ -56,12 +56,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_question_answering_udf( description, device_id, n_rows, - top_k, upload_base_model_to_local_bucketfs): + top_k, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -122,12 +122,12 @@ def test_question_answering_udf( ]) def test_question_answering_udf_on_error_handling( description, device_id, n_rows, - top_k, upload_base_model_to_local_bucketfs): + top_k, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py 
b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py index 6823f669..43d9445a 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py +++ b/tests/integration_tests/without_db/udfs/test_sequence_classification_single_text_udf.py @@ -48,12 +48,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_single_text_udf( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -106,12 +106,12 @@ def test_sequence_classification_single_text_udf( ("on GPU", 0) ]) def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py index a986dc2d..e3669f26 100644 --- a/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py +++ 
b/tests/integration_tests/without_db/udfs/test_sequence_classification_text_pair_udf.py @@ -48,12 +48,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -108,12 +108,12 @@ def test_sequence_classification_text_pair_udf( ("on GPU", 0) ]) def test_sequence_classification_text_pair_udf_on_error_handling( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py index 38f9c977..3eabcf2a 100644 --- a/tests/integration_tests/without_db/udfs/test_text_generation_udf.py +++ b/tests/integration_tests/without_db/udfs/test_text_generation_udf.py @@ -52,12 +52,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU with single input", 0, 1) ]) def test_text_generation_udf( - description, device_id, n_rows, upload_base_model_to_local_bucketfs): + description, device_id, n_rows, 
prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -112,12 +112,12 @@ def test_text_generation_udf( ("on GPU with single input", 0, 1) ]) def test_text_generation_udf_on_error_handlig( - description, device_id, n_rows, upload_base_model_to_local_bucketfs): + description, device_id, n_rows, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py index a1e7c012..b6129ced 100644 --- a/tests/integration_tests/without_db/udfs/test_token_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_token_classification_udf.py @@ -60,12 +60,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_token_classification_udf( description, device_id, n_rows, agg, - upload_base_model_to_local_bucketfs): + prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = 
Connection(address=f"file://{bucketfs_base_path}") @@ -113,12 +113,12 @@ def test_token_classification_udf( ("on GPU", 0) ]) def test_token_classification_udf_with_multiple_aggregation_strategies( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -155,8 +155,7 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( ['start_pos', 'end_pos', 'word', 'entity', 'score', 'error_message'] result = Result(result_df) - assert ( - result == ColumnsMatcher(columns=columns[1:], new_columns=new_columns) + assert (result == ColumnsMatcher(columns=columns[1:], new_columns=new_columns) and result == NoErrorMessageMatcher() and set(result_df['aggregation_strategy'].unique()) == {"none", "simple", "max", "average"} ) @@ -179,12 +178,12 @@ def test_token_classification_udf_with_multiple_aggregation_strategies( ]) def test_token_classification_udf_on_error_handling( description, device_id, n_rows, agg, - upload_base_model_to_local_bucketfs): + prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_translation_udf.py b/tests/integration_tests/without_db/udfs/test_translation_udf.py index 
f36e89d8..fb148c42 100644 --- a/tests/integration_tests/without_db/udfs/test_translation_udf.py +++ b/tests/integration_tests/without_db/udfs/test_translation_udf.py @@ -1,14 +1,16 @@ -import tempfile import torch -import pytest -import pandas as pd from typing import Dict + +import pandas as pd +import pytest +import torch +from exasol_udf_mock_python.connection import Connection + from exasol_transformers_extension.udfs.models.translation_udf import \ TranslationUDF -from tests.integration_tests.without_db.udfs.matcher import Result, ScoreMatcher, ShapeMatcher, NoErrorMessageMatcher, \ +from tests.integration_tests.without_db.udfs.matcher import Result, ShapeMatcher, NoErrorMessageMatcher, \ NewColumnsEmptyMatcher, ErrorMessageMatcher from tests.utils.parameters import model_params -from exasol_udf_mock_python.connection import Connection class ExaEnvironment: @@ -62,12 +64,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ]) def test_translation_udf( description, device_id, languages, - upload_seq2seq_model_to_local_bucketfs): + prepare_seq2seq_model_in_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_seq2seq_model_to_local_bucketfs + bucketfs_base_path = prepare_seq2seq_model_in_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -132,12 +134,12 @@ def test_translation_udf( ]) def test_translation_udf_on_error_handling( description, device_id, languages, - upload_seq2seq_model_to_local_bucketfs): + prepare_seq2seq_model_in_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_seq2seq_model_to_local_bucketfs + bucketfs_base_path = prepare_seq2seq_model_in_local_bucketfs bucketfs_conn_name = 
"bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py index 55bea11e..22d2dd80 100644 --- a/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py +++ b/tests/integration_tests/without_db/udfs/test_zero_shot_text_classification_udf.py @@ -49,12 +49,12 @@ def get_dataframe(self, num_rows='all', start_col=0): ("on GPU", 0) ]) def test_sequence_classification_single_text_udf( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") @@ -114,12 +114,12 @@ def test_sequence_classification_single_text_udf( ("on GPU", 0) ]) def test_sequence_classification_single_text_udf_on_error_handling( - description, device_id, upload_base_model_to_local_bucketfs): + description, device_id, prepare_base_model_for_local_bucketfs): if device_id is not None and not torch.cuda.is_available(): pytest.skip(f"There is no available device({device_id}) " f"to execute the test") - bucketfs_base_path = upload_base_model_to_local_bucketfs + bucketfs_base_path = prepare_base_model_for_local_bucketfs bucketfs_conn_name = "bucketfs_connection" bucketfs_connection = Connection(address=f"file://{bucketfs_base_path}") diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py index cd53dfa6..688cb17d 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_multiple_model_multiple_batch.py @@ -53,9 +53,9 @@ class ErrorNotCachedMultipleModelMultipleBatch: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py index 3636de1f..9e741c32 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_not_cached_single_model_multiple_batch.py @@ -48,7 +48,7 @@ class ErrorNotCachedSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py index 8cb6093f..4aa8556e 100644 --- 
a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_multiple_model_multiple_batch.py @@ -52,9 +52,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py index 1368ef98..01a8e81f 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/error_on_prediction_single_model_multiple_batch.py @@ -47,7 +47,7 @@ class ErrorOnPredictionSingleModelMultipleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/mock_filling_mask.py b/tests/unit_tests/udf_wrapper_params/filling_mask/mock_filling_mask.py index 35d84fa5..9d9815d0 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/mock_filling_mask.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/mock_filling_mask.py @@ -13,12 +13,13 @@ def to(self, device): class 
MockFillingMaskFactory: - def __init__(self, mock_models: Dict[Tuple[PurePosixPath, str], MockFillingMaskModel]): + def __init__(self, mock_models: Dict[PurePosixPath, MockFillingMaskModel]): self.mock_models = mock_models - def from_pretrained(self, model_name, cache_dir, use_auth_token): + + def from_pretrained(self, model_path): # the cache_dir path already has model_name - return self.mock_models[(cache_dir, use_auth_token)] + return self.mock_models[PurePosixPath(model_path)] class MockPipeline: diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/mock_sequence_tokenizer.py b/tests/unit_tests/udf_wrapper_params/filling_mask/mock_sequence_tokenizer.py index 89605ede..2867e658 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/mock_sequence_tokenizer.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/mock_sequence_tokenizer.py @@ -3,7 +3,7 @@ class MockSequenceTokenizer: @classmethod - def from_pretrained(cls, model_name, cache_dir, use_auth_token): + def from_pretrained(cls, model_path): cls.mask_token = "valid" return cls diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py index d0a34492..e39c6a41 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_multiple_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameMultipleBatch: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - 
(PurePosixPath(base_cache_dir2, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py index 6bece9ea..f19b5849 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_bfsconn_single_subdir_single_model_single_batch.py @@ -52,9 +52,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameSingleBatch: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py index 16cb3338..c67550e7 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelMultipleBatchComplete: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + 
PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py index d4885996..4c384503 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_incomplete.py @@ -53,9 +53,9 @@ class MultipleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py index a9761963..6688261b 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_multiple_batch_multiple_models_per_batch.py @@ -64,13 +64,13 @@ class MultipleModelMultipleBatchMultipleModelsPerBatch: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - 
(PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1), - (PurePosixPath(base_cache_dir3, "sub_dir3", "model3"), "token1"): + PurePosixPath(base_cache_dir3, "sub_dir3", "model3", "pretrained", "model3"): MockFillingMaskModel(sequence="text valid 3", score=0.3, rank=1), - (PurePosixPath(base_cache_dir4, "sub_dir4", "model4"), "token1"): + PurePosixPath(base_cache_dir4, "sub_dir4", "model4", "pretrained", "model4"): MockFillingMaskModel(sequence="text valid 4", score=0.4, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py index 71ba12b2..f0d45baa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_complete.py @@ -52,9 +52,9 @@ class MultipleModelSingleBatchComplete: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py index 7c848ffc..22232ed8 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_model_single_batch_incomplete.py @@ -52,9 +52,9 @@ class MultipleModelSingleBatchIncomplete: "token_conn1": Connection(address='', password="token1") } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir2, "sub_dir2", "model2"), "token1"): + PurePosixPath(base_cache_dir2, "sub_dir2", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_token_conn_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_token_conn_single_batch_complete.py index 5377a5cd..2afe155b 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_token_conn_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_token_conn_single_batch_complete.py @@ -52,9 +52,9 @@ class MultipleTokenConnSingleBatchComplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token2"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py index ca41a670..c987ea08 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_multiple_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py index e01d2344..ef17ca38 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/multiple_topk_single_model_single_batch.py @@ -52,7 +52,7 @@ class MultipleTopkSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py index 2d09a280..00e027c5 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_multiple_batch.py @@ -56,6 +56,12 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameMultipleBatch: 
(PurePosixPath(base_cache_dir1, "sub_dir2", "model1"), "token1"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) + mock_factory = MockFillingMaskFactory({ + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): + MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), + PurePosixPath(base_cache_dir1, "sub_dir2", "model1", "pretrained", "model1"): + MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) + }) mock_pipeline = MockPipeline udf_wrapper = udf_wrapper diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py index f3cc3f67..926a4225 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_bfsconn_multiple_subdir_single_model_single_batch.py @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir1, "sub_dir2", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir2", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py index ce4a46ee..4e1c26fa 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_complete.py @@ 
-47,7 +47,7 @@ class SingleModelMultipleBatchComplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py index 75b24b60..abd43e67 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_multiple_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelMultipleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py index 5bd8e2f7..195bc49b 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_complete.py @@ -47,8 +47,8 @@ class SingleModelSingleBatchComplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): - MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): + MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) mock_pipeline = MockPipeline diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py 
b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py index c90854eb..3704ed36 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_model_single_batch_incomplete.py @@ -47,7 +47,7 @@ class SingleModelSingleBatchIncomplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py index 795a3b27..759b37b3 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_multiple_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameMultipleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir1, "sub_dir1", "model2"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py index 10683abc..df301162 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py +++ 
b/tests/unit_tests/udf_wrapper_params/filling_mask/single_topk_multiple_model_single_batch.py @@ -51,9 +51,9 @@ class SingleTopkMultipleModelNameSingleBatch: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir1, "sub_dir1", "model2"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model2", "pretrained", "model2"): MockFillingMaskModel(sequence="text valid 2", score=0.2, rank=1) }) diff --git a/tests/unit_tests/udf_wrapper_params/filling_mask/token_conn_and_no_token_mixed_single_batch_complete.py b/tests/unit_tests/udf_wrapper_params/filling_mask/token_conn_and_no_token_mixed_single_batch_complete.py index e72b8808..3d6d8697 100644 --- a/tests/unit_tests/udf_wrapper_params/filling_mask/token_conn_and_no_token_mixed_single_batch_complete.py +++ b/tests/unit_tests/udf_wrapper_params/filling_mask/token_conn_and_no_token_mixed_single_batch_complete.py @@ -51,9 +51,9 @@ class TokenConnAndNoTokenMixedSingleBatchComplete: } mock_factory = MockFillingMaskFactory({ - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), "token1"): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1), - (PurePosixPath(base_cache_dir1, "sub_dir1", "model1"), False): + PurePosixPath(base_cache_dir1, "sub_dir1", "model1", "pretrained", "model1"): MockFillingMaskModel(sequence="text valid 1", score=0.1, rank=1) }) diff --git a/tests/unit_tests/udfs/test_base_udf.py b/tests/unit_tests/udfs/test_base_udf.py index c3dc932e..6ca972c5 100644 --- a/tests/unit_tests/udfs/test_base_udf.py +++ b/tests/unit_tests/udfs/test_base_udf.py @@ -9,8 +9,7 @@ from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context from 
tests.unit_tests.udfs.base_model_dummy_implementation import DummyImplementationUDF -from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol -from exasol_transformers_extension.utils.load_model import LoadModel +from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol from tests.utils.mock_cast import mock_cast from tests.unit_tests.udf_wrapper_params.zero_shot.mock_zero_shot import MockPipeline import re diff --git a/tests/unit_tests/udfs/test_model_downloader_udf.py b/tests/unit_tests/udfs/test_model_downloader_udf.py index 65295930..d5a7bb56 100644 --- a/tests/unit_tests/udfs/test_model_downloader_udf.py +++ b/tests/unit_tests/udfs/test_model_downloader_udf.py @@ -11,8 +11,9 @@ from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context from exasol_transformers_extension.udfs.models.model_downloader_udf import \ ModelDownloaderUDF -from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol, \ - HuggingFaceHubBucketFSModelTransferFactory, HuggingFaceHubBucketFSModelTransfer +from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol +from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \ + HuggingFaceHubBucketFSModelTransferSPFactory, HuggingFaceHubBucketFSModelTransferSP from tests.utils.matchers import AnyOrder from tests.utils.mock_cast import mock_cast @@ -47,10 +48,10 @@ def udf_wrapper(): def test_model_downloader(description, count, token_conn_name, token_conn_obj, expected_token): mock_base_model_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) mock_tokenizer_factory: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) - mock_model_downloader_factory: Union[HuggingFaceHubBucketFSModelTransferFactory, MagicMock] = create_autospec( - 
HuggingFaceHubBucketFSModelTransferFactory) - mock_model_downloaders: List[Union[HuggingFaceHubBucketFSModelTransfer, MagicMock]] = [ - create_autospec(HuggingFaceHubBucketFSModelTransfer) + mock_model_downloader_factory: Union[HuggingFaceHubBucketFSModelTransferSPFactory, MagicMock] = create_autospec( + HuggingFaceHubBucketFSModelTransferSPFactory) + mock_model_downloaders: List[Union[HuggingFaceHubBucketFSModelTransferSP, MagicMock]] = [ + create_autospec(HuggingFaceHubBucketFSModelTransferSP) for i in range(count)] for i in range(count): mock_cast(mock_model_downloaders[i].__enter__).side_effect = [mock_model_downloaders[i]] diff --git a/tests/unit_tests/utils/test_huggingface_hub_bucketfs_model_transfer.py b/tests/unit_tests/utils/test_huggingface_hub_bucketfs_model_transfer.py deleted file mode 100644 index 0e3c1499..00000000 --- a/tests/unit_tests/utils/test_huggingface_hub_bucketfs_model_transfer.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path -from typing import Union -from unittest.mock import create_autospec, MagicMock, call - -from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation - -from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploader, \ - BucketFSModelUploaderFactory -from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol, \ - HuggingFaceHubBucketFSModelTransfer -from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory -from tests.utils.mock_cast import mock_cast - - -class TestSetup: - def __init__(self): - self.bucketfs_location_mock: Union[BucketFSLocation, MagicMock] = create_autospec(BucketFSLocation) - self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol) - self.temporary_directory_factory_mock: Union[TemporaryDirectoryFactory, MagicMock] = \ - create_autospec(TemporaryDirectoryFactory) - 
self.bucketfs_model_uploader_factory_mock: Union[BucketFSModelUploaderFactory, MagicMock] = \ - create_autospec(BucketFSModelUploaderFactory) - self.bucketfs_model_uploader_mock: Union[BucketFSModelUploader, MagicMock] = \ - create_autospec(BucketFSModelUploader) - mock_cast(self.bucketfs_model_uploader_factory_mock.create).side_effect = [self.bucketfs_model_uploader_mock] - - self.token = "token" - self.model_name = "test_model_name" - self.model_path = Path("test_model_path") - self.downloader = HuggingFaceHubBucketFSModelTransfer( - bucketfs_location=self.bucketfs_location_mock, - model_path=self.model_path, - model_name=self.model_name, - token=self.token, - temporary_directory_factory=self.temporary_directory_factory_mock, - bucketfs_model_uploader_factory=self.bucketfs_model_uploader_factory_mock - ) - - def reset_mocks(self): - self.bucketfs_location_mock.reset_mock() - self.temporary_directory_factory_mock.reset_mock() - self.model_factory_mock.reset_mock() - self.bucketfs_model_uploader_mock.reset_mock() - - -def test_init(): - test_setup = TestSetup() - assert test_setup.temporary_directory_factory_mock.mock_calls == [call.create(), call.create().__enter__()] \ - and test_setup.model_factory_mock.mock_calls == [] \ - and test_setup.bucketfs_location_mock.mock_calls == [] \ - and mock_cast(test_setup.bucketfs_model_uploader_factory_mock.create).mock_calls == [ - call.create(model_path=test_setup.model_path, bucketfs_location=test_setup.bucketfs_location_mock) - ] - - -def test_download(): - test_setup = TestSetup() - test_setup.downloader.download_from_huggingface_hub(model_factory=test_setup.model_factory_mock) - cache_dir = test_setup.temporary_directory_factory_mock.create().__enter__() - assert test_setup.model_factory_mock.mock_calls == [ - call.from_pretrained(test_setup.model_name, cache_dir=cache_dir, - use_auth_token=test_setup.token)] - - -def test_upload(): - test_setup = TestSetup() - 
test_setup.downloader.download_from_huggingface_hub(model_factory=test_setup.model_factory_mock) - test_setup.reset_mocks() - test_setup.downloader.upload_to_bucketfs() - cache_dir = test_setup.temporary_directory_factory_mock.create().__enter__() - assert mock_cast(test_setup.bucketfs_model_uploader_mock.upload_directory).mock_calls == [call(cache_dir)]