exasol · MarleneKress79789 · Mar 7, 2024 · Jan 31, 2024 · Jan 31, 2024 · Feb 5, 2024
diff --git a/doc/changes/changelog.md b/doc/changes/changelog.md
@@ -1,5 +1,6 @@
 # Changelog
 
+* [0.10.0](changes_0.10.0.md)
 * [0.9.0](changes_0.9.0.md)
 * [0.8.0](changes_0.8.0.md)
 * [0.7.0](changes_0.7.0.md)

diff --git a/doc/changes/changes_0.10.0.md b/doc/changes/changes_0.10.0.md
@@ -0,0 +1,19 @@
+# Transformers Extension 0.10.0, T.B.D
+
+Code name: T.B.D
+
+
+## Summary
+T.B.D
+
+
+### Features
+
+- #146: Integrated new download and load functions using save_pretrained
+
+### Refactorings
+
+
+### Security 
+
+
diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md
@@ -229,9 +229,10 @@ Before you can use pre-trained models, the models must be stored in the
 BucketFS. We provide two different ways to load transformers models 
 into BucketFS:
 
+
 ### 1. Model Downloader UDF
 Using the `TE_MODEL_DOWNLOADER_UDF` below, you can download the desired model 
-from the huggingface hub and upload it to bucketfs.
+from the huggingface hub and upload it to BucketFS.
 
 ```sql
 SELECT TE_MODEL_DOWNLOADER_UDF(
@@ -274,6 +275,8 @@ models from the local filesystem into BucketFS:
   ```
 
 *Note*: The options --local-model-path needs to point to a path which contains the model and its tokenizer. 
+These should have been saved using the transformers [save_pretrained](https://huggingface.co/docs/transformers/v4.32.1/en/installation#fetch-models-and-tokenizers-to-use-offline) 
+function to ensure proper loading by the Transformers Extension UDFs.
 
 ## Prediction UDFs
 We provided 7 prediction UDFs, each performing an NLP task through the [transformers API](https://huggingface.co/docs/transformers/task_summary). 

diff --git a/exasol_transformers_extension/udfs/models/base_model_udf.py b/exasol_transformers_extension/udfs/models/base_model_udf.py
@@ -1,33 +1,44 @@
+import os
 from abc import abstractmethod, ABC
 from typing import Iterator, List, Any
 import torch
 import traceback
 import pandas as pd
 import numpy as np
+import transformers
+
 from exasol_transformers_extension.deployment import constants
 from exasol_transformers_extension.utils import device_management, \
     bucketfs_operations, dataframe_operations
-from exasol_transformers_extension.utils.load_model import LoadModel
+from exasol_transformers_extension.utils.load_local_model import LoadLocalModel
+from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
 
 
 class BaseModelUDF(ABC):
     """
     This base class should be extended by each UDF class containing model logic.
-    This class contains common operations for all prediction UDFs. The following
-    methods should be implemented specifically for each UDF class:
+    This class contains common operations for all prediction UDFs:
         - accesses data part-by-part based on predefined batch size
         - manages the script cache
         - reads the corresponding model from BucketFS into cache
         - creates model pipeline through transformer api
         - manages the creation of predictions and the preparation of results.
+
+    Additionally, the following
+    methods should be implemented specifically for each UDF class:
+        - create_dataframes_from_predictions
+        - extract_unique_param_based_dataframes
+        - execute_prediction
+        - append_predictions_to_input_dataframe
+
     """
     def __init__(self,
                  exa,
-                 batch_size,
-                 pipeline,
-                 base_model,
-                 tokenizer,
-                 task_name):
+                 batch_size: int,
+                 pipeline: transformers.Pipeline,
+                 base_model: ModelFactoryProtocol,
+                 tokenizer: ModelFactoryProtocol,
+                 task_name: str):
         self.exa = exa
         self.batch_size = batch_size
         self.pipeline = pipeline
@@ -59,11 +70,11 @@ def create_model_loader(self):
         """
         Creates the model_loader.
         """
-        self.model_loader = LoadModel(self.pipeline,
-                                      self.base_model,
-                                      self.tokenizer,
-                                      self.task_name,
-                                      self.device)
+        self.model_loader = LoadLocalModel(_pipeline_factory=self.pipeline,
+                                           base_model_factory=self.base_model,
+                                           tokenizer_factory=self.tokenizer,
+                                           task_name=self.task_name,
+                                           device=self.device)
 
     def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -180,17 +191,11 @@ def check_cache(self, model_df: pd.DataFrame) -> None:
         token_conn = model_df["token_conn"].iloc[0]
 
         current_model_key = (bucketfs_conn, sub_dir, model_name, token_conn)
-        if self.model_loader.last_loaded_model_key != current_model_key:
+        if self.model_loader.loaded_model_key != current_model_key:
             self.set_cache_dir(model_name, bucketfs_conn, sub_dir)
             self.model_loader.clear_device_memory()
-            if token_conn:
-                token_conn_obj = self.exa.get_connection(token_conn)
-            else:
-                token_conn_obj = None
-            self.last_created_pipeline = self.model_loader.load_models(model_name,
-                                                                       current_model_key,
-                                                                       self.cache_dir,
-                                                                       token_conn_obj)
+            self.last_created_pipeline = self.model_loader.load_models(self.cache_dir,
+                                                                       current_model_key)
 
     def set_cache_dir(
             self, model_name: str, bucketfs_conn_name: str,
@@ -206,11 +211,10 @@ def set_cache_dir(
             bucketfs_operations.create_bucketfs_location_from_conn_object(
                 self.exa.get_connection(bucketfs_conn_name))
 
-        model_path = bucketfs_operations.get_model_path(sub_dir, model_name)
+        model_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name)
         self.cache_dir = bucketfs_operations.get_local_bucketfs_path(
             bucketfs_location=bucketfs_location, model_path=str(model_path))
 
-
     def get_prediction(self, model_df: pd.DataFrame) -> pd.DataFrame:
         """
         Perform prediction of the given model and preparation of the prediction

diff --git a/exasol_transformers_extension/udfs/models/model_downloader_udf.py b/exasol_transformers_extension/udfs/models/model_downloader_udf.py
@@ -4,17 +4,29 @@
 from exasol_bucketfs_utils_python.bucketfs_factory import BucketFSFactory
 
 from exasol_transformers_extension.utils import bucketfs_operations
-from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol, \
-    HuggingFaceHubBucketFSModelTransferFactory
+from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
+from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import \
+    HuggingFaceHubBucketFSModelTransferSPFactory
 
 
 class ModelDownloaderUDF:
+    """
+    UDF which downloads a pretrained model from Huggingface using Huggingface's transformers API,
+    and uploads it to the BucketFS, from where it can then be loaded without accessing Huggingface again.
+    Must be called with the following Input Table:
+
+    model_name                | sub_dir                 | bfs_conn            | token_conn
+    ---------------------------------------------------------------------------------------------------
+    name of Huggingface model | directory to save model | BucketFS connection | name of token connection
+
+    returns <sub_dir/model_name>, <path of the model in BucketFS>
+    """
     def __init__(self,
                  exa,
                  base_model_factory: ModelFactoryProtocol = transformers.AutoModel,
                  tokenizer_factory: ModelFactoryProtocol = transformers.AutoTokenizer,
-                 huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferFactory =
-                 HuggingFaceHubBucketFSModelTransferFactory(),
+                 huggingface_hub_bucketfs_model_transfer: HuggingFaceHubBucketFSModelTransferSPFactory =
+                 HuggingFaceHubBucketFSModelTransferSPFactory(),
                  bucketfs_factory: BucketFSFactory = BucketFSFactory()):
         self._exa = exa
         self._base_model_factory = base_model_factory
@@ -31,10 +43,10 @@ def run(self, ctx) -> None:
 
     def _download_model(self, ctx) -> Tuple[str, str]:
         # parameters
-        model_name = ctx.model_name
-        sub_dir = ctx.sub_dir
-        bfs_conn = ctx.bfs_conn
-        token_conn = ctx.token_conn
+        model_name = ctx.model_name     # name of Huggingface model
+        sub_dir = ctx.sub_dir           # directory to save model
+        bfs_conn = ctx.bfs_conn         # BucketFS connection
+        token_conn = ctx.token_conn     # name of token connection
 
         # extract token from the connection if token connection name is given.
         # note that, token is required for private models. It doesn't matter
@@ -64,6 +76,7 @@ def _download_model(self, ctx) -> Tuple[str, str]:
         ) as downloader:
             for model in [self._base_model_factory, self._tokenizer_factory]:
                 downloader.download_from_huggingface_hub(model)
+            # upload model files to BucketFS
             model_tar_file_path = downloader.upload_to_bucketfs()
 
         return str(model_path), str(model_tar_file_path)
diff --git a/exasol_transformers_extension/upload_model.py b/exasol_transformers_extension/upload_model.py
@@ -33,13 +33,18 @@ def main(
         model_name: str,
         sub_dir: str,
         local_model_path: str):
+    """
+    Script for uploading locally saved model files to BucketFS. Files should have been saved locally
+    using the Transformers save_pretrained function. This ensures proper loading from the BucketFS later.
+    """
     # create bucketfs location
     bucketfs_location = bucketfs_operations.create_bucketfs_location(
         bucketfs_name, bucketfs_host, bucketfs_port, bucketfs_use_https,
         bucketfs_user, bucketfs_password, bucket, path_in_bucket)
 
     # upload the downloaded model files into bucketfs
-    upload_path = bucketfs_operations.get_model_path(sub_dir, model_name)
+    upload_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name)
+
     bucketfs_operations.upload_model_files_to_bucketfs(
         local_model_path, upload_path, bucketfs_location)
 

diff --git a/exasol_transformers_extension/utils/bucketfs_model_uploader.py b/exasol_transformers_extension/utils/bucketfs_model_uploader.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 
+from exasol_bucketfs_utils_python.abstract_bucketfs_location import AbstractBucketFSLocation
 from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation
 
 from exasol_transformers_extension.utils import bucketfs_operations

diff --git a/exasol_transformers_extension/utils/bucketfs_operations.py b/exasol_transformers_extension/utils/bucketfs_operations.py
@@ -41,10 +41,13 @@ def create_bucketfs_location(
 def upload_model_files_to_bucketfs(
         tmpdir_name: str, model_path: Path,
         bucketfs_location: AbstractBucketFSLocation) -> Path:
+    """
+    Packs the model files in tmpdir_name into a tar archive and uploads it to model_path (with suffix ".tar.gz") in bucketfs_location.
+    """
     with tempfile.TemporaryFile() as fileobj:
         create_tar_of_directory(Path(tmpdir_name), fileobj)
-        model_tar_file = model_path.with_suffix(".tar.gz")
-        return upload_file_to_bucketfs_with_retry(bucketfs_location, fileobj, model_tar_file)
+        model_upload_tar_file_path = model_path.with_suffix(".tar.gz")
+        return upload_file_to_bucketfs_with_retry(bucketfs_location, fileobj, model_upload_tar_file_path)
 
 
 @retry(wait=wait_fixed(2), stop=stop_after_attempt(10))
@@ -70,3 +73,7 @@ def get_local_bucketfs_path(
 
 def get_model_path(sub_dir: str, model_name: str) -> Path:
     return Path(sub_dir, model_name.replace('-', '_'))
+
+
+def get_model_path_with_pretrained(sub_dir: str, model_name: str) -> Path:
+    return Path(sub_dir, model_name.replace('-', '_'), "pretrained" , model_name)
diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer.py
diff --git a/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py b/exasol_transformers_extension/utils/huggingface_hub_bucketfs_model_transfer_sp.py
@@ -1,24 +1,22 @@
 from pathlib import Path
 
+from exasol_bucketfs_utils_python.abstract_bucketfs_location import AbstractBucketFSLocation
 from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation
 
 from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
 from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploaderFactory
 from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory
 
 
-
-
-
 class HuggingFaceHubBucketFSModelTransferSP:
     """
-    Class for downloading a model using the Huggingface Transformers API, and loading it into the BucketFS
-    using save_pretrained.
+    Class for downloading a model using the Huggingface Transformers API, saving it locally using
+    transformers save_pretrained, and uploading the saved model files to the BucketFS.
 
-    :bucketfs_location:     BucketFSLocation the model should be loaded to
-    :model_name:            Name of the model to be downloaded using Huggingface Transformers API
-    :model_path:            Path the model will be loaded into the BucketFS at
-    :token:                 Huggingface token, only needed for private models
+    :bucketfs_location:                 BucketFSLocation the model should be uploaded to
+    :model_name:                        Name of the model to be downloaded using Huggingface Transformers API
+    :model_path:                        Path in the BucketFS the model will be uploaded to
+    :token:                             Huggingface token, only needed for private models
     :temporary_directory_factory:       Optional. Default is TemporaryDirectoryFactory. Mainly change for testing.
     :bucketfs_model_uploader_factory:   Optional. Default is BucketFSModelUploaderFactory. Mainly change for testing.
     """
@@ -50,9 +48,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol):
         """
         Download a model from HuggingFace Hub into a temporary directory and save it with save_pretrained
-        in temporary directory / pretrained .
+        in temporary directory / pretrained / model_name.
         """
-        model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache", use_auth_token=self._token)
+        model = model_factory.from_pretrained(self._model_name, cache_dir=self._tmpdir_name / "cache",
+                                              use_auth_token=self._token)
         model.save_pretrained(self._tmpdir_name / "pretrained" / self._model_name)
 
     def upload_to_bucketfs(self) -> Path:
@@ -61,7 +60,7 @@ def upload_to_bucketfs(self) -> Path:
 
         returns: Path of the uploaded model in the BucketFS
         """
-        return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name)
+        return self._bucketfs_model_uploader.upload_directory(self._tmpdir_name / "pretrained" / self._model_name)  # TODO: should we do replace('-', '_') here too?
 def get_model_path(sub_dir: str, model_name: str) -> Path: 
 model_params.base_model, tmpdir / model_params.sub_dir / model_params.base_model.replace("-", "_")): 
 def get_model_path(sub_dir: str, model_name: str) -> Path: 
 model_params.base_model, tmpdir / model_params.sub_dir / model_params.base_model.replace("-", "_")): 
 
 
 class HuggingFaceHubBucketFSModelTransferSPFactory:

diff --git a/exasol_transformers_extension/utils/load_local_model.py b/exasol_transformers_extension/utils/load_local_model.py
@@ -10,8 +10,8 @@ class LoadLocalModel:
     Class for loading locally saved models and tokenizers. Also stores information regarding the model and pipeline.
 
     :_pipeline_factory:      a function to create a transformers pipeline
-    :task_name:             name of the current task
-    :device:                device to be used for pipeline creation
+    :task_name:              name of the current task
+    :device:                 device to be used for pipeline creation, e.g. "CPU"
     :_base_model_factory:    a ModelFactoryProtocol for creating the loaded model
     :_tokenizer_factory:     a ModelFactoryProtocol for creating the loaded tokenizer
     """
@@ -39,7 +39,7 @@ def load_models(self,
                     current_model_key: str
                     ) -> transformers.pipelines.Pipeline:
         """
-        Loads a locally saved model and tokenizer from "cache_dir / "pretrained" / model_name".
+        Loads a locally saved model and tokenizer from model_path.
         Returns new pipeline corresponding to the model and task.
 
         :model_path:            location of the saved model and tokenizer