#144: Extracted load_models into separate class #161

Merged
Changes from 4 commits
23 changes: 23 additions & 0 deletions doc/changes/changes_0.7.0.md
@@ -0,0 +1,23 @@
# Transformers Extension 0.7.0, released T.B.D

Code name: T.B.D


## Summary

T.B.D

### Features

### Bug Fixes

### Refactorings

- #144: Extracted base_udf.load_models into separate class


### Documentation



### Security
60 changes: 25 additions & 35 deletions exasol_transformers_extension/udfs/models/base_model_udf.py
@@ -7,6 +7,7 @@
from exasol_transformers_extension.deployment import constants
from exasol_transformers_extension.utils import device_management, \
bucketfs_operations, dataframe_operations
from exasol_transformers_extension.utils.load_model import LoadModel


class BaseModelUDF(ABC):
@@ -20,7 +21,6 @@ class BaseModelUDF(ABC):
- creates model pipeline through transformer api
- manages the creation of predictions and the preparation of results.
"""

def __init__(self,
exa,
batch_size,
@@ -36,15 +36,14 @@ def __init__(self,
self.task_name = task_name
self.device = None
self.cache_dir = None
self.last_loaded_model_key = None
self.last_loaded_model = None
self.last_loaded_tokenizer = None
self.model_loader = None
self.last_created_pipeline = None
self.new_columns = []

def run(self, ctx):
device_id = ctx.get_dataframe(1).iloc[0]['device_id']
self.device = device_management.get_torch_device(device_id)
self.create_model_loader()
ctx.reset()

while True:
@@ -56,6 +55,17 @@

self.clear_device_memory()

def create_model_loader(self):
"""
Creates the model_loader. Kept as a separate method so it can be replaced
for tests, since pipeline creation does not work with dummy data.
"""
self.model_loader = LoadModel(self.pipeline,
self.base_model,
self.tokenizer,
self.task_name,
self.device)

def get_predictions_from_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame:
"""
Perform separate predictions for each model in the dataframe.
@@ -171,11 +181,17 @@ def check_cache(self, model_df: pd.DataFrame) -> None:
token_conn = model_df["token_conn"].iloc[0]

current_model_key = (bucketfs_conn, sub_dir, model_name, token_conn)
if self.last_loaded_model_key != current_model_key:
if self.model_loader.last_loaded_model_key != current_model_key:
self.set_cache_dir(model_name, bucketfs_conn, sub_dir)
self.clear_device_memory()
self.load_models(model_name, token_conn)
self.last_loaded_model_key = current_model_key
if token_conn:
token_conn_obj = self.exa.get_connection(token_conn)
else:
token_conn_obj = None
self.last_created_pipeline = self.model_loader.load_models(model_name,
current_model_key,
self.cache_dir,
token_conn_obj)

def set_cache_dir(
self, model_name: str, bucketfs_conn_name: str,
@@ -199,36 +215,10 @@ def clear_device_memory(self):
"""
Delete models and free device memory
"""
self.last_loaded_model = None
self.last_loaded_tokenizer = None
self.model_loader.last_loaded_model = None
self.model_loader.last_loaded_tokenizer = None
Collaborator:
I would add a clear method to the ModelLoader setting these two variables to None.
If last_loaded_model and last_loaded_tokenizer still need to be visible I would make them properties.

BTW, if the last_created_pipeline keeps references to these objects they won't be garbage-collected, so you probably need to set it to None too.
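
A minimal sketch of this suggestion, assuming a clear method on LoadModel plus the matching reset in the UDF (hypothetical, not part of this PR's diff):

    # Hypothetical clear method on LoadModel
    def clear(self):
        # Drop the references so model and tokenizer can be garbage-collected
        self.last_loaded_model = None
        self.last_loaded_tokenizer = None

    # BaseModelUDF.clear_device_memory could then read:
    #     self.model_loader.clear()
    #     self.last_created_pipeline = None  # the pipeline also holds references
    #     torch.cuda.empty_cache()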

Collaborator (Author):

Will move the clear method from the base UDF as well.

torch.cuda.empty_cache()

def load_models(self, model_name: str, token_conn_name: str) -> None:
"""
Load model and tokenizer model from the cached location in bucketfs.
If the desired model is not cached, this method will attempt to
download the model to the read-only path /bucket/.. and cause an error.
This error will be addressed in ticket
https://github.com/exasol/transformers-extension/issues/43.

:param model_name: The model name to be loaded
"""
token = False
if token_conn_name:
token_conn_obj = self.exa.get_connection(token_conn_name)
token = token_conn_obj.password

self.last_loaded_model = self.base_model.from_pretrained(
model_name, cache_dir=self.cache_dir, use_auth_token=token)
self.last_loaded_tokenizer = self.tokenizer.from_pretrained(
model_name, cache_dir=self.cache_dir, use_auth_token=token)
self.last_created_pipeline = self.pipeline(
self.task_name,
model=self.last_loaded_model,
tokenizer=self.last_loaded_tokenizer,
device=self.device,
framework="pt")

def get_prediction(self, model_df: pd.DataFrame) -> pd.DataFrame:
"""
Perform prediction of the given model and preparation of the prediction
48 changes: 48 additions & 0 deletions exasol_transformers_extension/utils/load_model.py
@@ -0,0 +1,48 @@


class LoadModel:
def __init__(self,
pipeline,
base_model,
tokenizer,
task_name,
device
):
Collaborator:
Adding type annotations and/or parameter descriptions would be helpful.
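
For illustration, a hypothetically annotated version of this signature, with types inferred from how BaseModelUDF constructs the loader (not part of the diff):

    from typing import Any, Callable

    def __init__(self,
                 pipeline: Callable[..., Any],  # transformers.pipeline-like factory
                 base_model: Any,               # factory exposing from_pretrained()
                 tokenizer: Any,                # factory exposing from_pretrained()
                 task_name: str,
                 device: Any                    # torch device resolved by the UDF
                 ):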

self.pipeline = pipeline
self.base_model = base_model
self.tokenizer = tokenizer
self.task_name = task_name
self.device = device
self.last_loaded_model = None
self.last_loaded_tokenizer = None
self.last_loaded_model_key = None

def load_models(self, model_name: str,
current_model_key,
cache_dir,
token_conn_obj):
Collaborator:
Can we provide type annotations for all parameters?
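
Likewise, a hedged guess at annotations for load_models, inferred from check_cache in the base UDF (the key is the 4-tuple built there, and the method actually returns the created pipeline rather than None):

    from typing import Any, Optional, Tuple

    def load_models(self, model_name: str,
                    current_model_key: Tuple[str, str, str, str],
                    cache_dir: str,  # or Path, depending on what set_cache_dir produces
                    token_conn_obj: Optional[Any]) -> Any: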

Collaborator (Author):
Regarding the documentation strings: this whole class will be replaced by the one that will be created in #145, so I would rather spend the time on proper documentation for the new class and leave this one as is.

Collaborator:
I would say the released code should be in a state of completeness. If the class morphs into something else so will the documentation. Plus it doesn't take long to get the docstrings sorted.

"""
Load model and tokenizer model from the cached location in bucketfs.
Collaborator:
Maybe "Load the language model and tokenizer model" or "Load the model and tokenizer"

If the desired model is not cached, this method will attempt to
download the model to the read-only path /bucket/.. and cause an error.
This error will be addressed in ticket
https://github.com/exasol/transformers-extension/issues/43.

:param model_name: The model name to be loaded
"""
Collaborator:
"The model name to be loaded" => "The name of the model to be loaded".

Other parameters' description?

The description of the function doesn't reflect everything that the function is doing. For example, it doesn't say it will create and return a pipeline.
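
A possible docstring addressing this feedback (wording illustrative only):

    Load the model and tokenizer from the cached location in BucketFS,
    build a transformers pipeline from them, and return that pipeline.

    :param model_name: The name of the model to be loaded
    :param current_model_key: Key identifying the model/connection combination
    :param cache_dir: The cache directory of the model in BucketFS
    :param token_conn_obj: Connection object holding the auth token, or None
    :return: The newly created pipeline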

token = False
if token_conn_obj:
Collaborator:
Maybe

    if token_conn_obj is not None:

The object may implement some boolean conversion rules that evaluate to False.

Collaborator (Author):
New bug ticket added here: #163

token = token_conn_obj.password

self.last_loaded_model = self.base_model.from_pretrained(
model_name, cache_dir=cache_dir, use_auth_token=token)
self.last_loaded_tokenizer = self.tokenizer.from_pretrained(
model_name, cache_dir=cache_dir, use_auth_token=token)
last_created_pipeline = self.pipeline(
self.task_name,
model=self.last_loaded_model,
tokenizer=self.last_loaded_tokenizer,
device=self.device,
framework="pt")
self.last_loaded_model_key = current_model_key
return last_created_pipeline
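
For context, a condensed sketch of how BaseModelUDF now drives this class, pieced together from the base_model_udf.py diff above (variable names abbreviated):

    # in run(): the loader is created once the device is known
    loader = LoadModel(pipeline, base_model, tokenizer, task_name, device)

    # in check_cache(): reload only when the model key changed
    if loader.last_loaded_model_key != current_model_key:
        last_created_pipeline = loader.load_models(
            model_name, current_model_key, cache_dir, token_conn_obj)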
44 changes: 38 additions & 6 deletions tests/unit_tests/udfs/base_model_dummy_implementation.py
@@ -6,6 +6,38 @@
BaseModelUDF


class DummyModelLoader:
"""
A dummy model loader that does not create a transformers Pipeline,
since pipeline creation fails with test data.
"""
def __init__(self,
base_model,
tokenizer,
task_name,
device
):
self.base_model = base_model
self.tokenizer = tokenizer
self.task_name = task_name
self.device = device
self.last_loaded_model = None
self.last_loaded_tokenizer = None
self.last_created_pipeline = None
self.last_loaded_model_key = None

def load_models(self, model_name: str,
current_model_key,
cache_dir,
token_conn_obj) -> None:
token = False
self.last_loaded_model = self.base_model.from_pretrained(
model_name, cache_dir=cache_dir, use_auth_token=token)
self.last_loaded_tokenizer = self.tokenizer.from_pretrained(
model_name, cache_dir=cache_dir, use_auth_token=token)
return None


Collaborator:
Why can't you use the actual ModelLoader with a pipeline-like function that does nothing and returns None?

Collaborator (Author):
Because the model loader is only initialized at run time of the UDF, since it gets input about the device that is only known then. I don't know of a way to change the function call at that point.

Collaborator:
If all you need is to make a LoadModel without a pipeline you could do something like this:

class DummyLoadModel(LoadModel):
    def __init__(self, base_model, tokenizer, task_name, device):
        super().__init__(lambda task_name, model, tokenizer, device, framework: None,
                         base_model, tokenizer, task_name, device)

But I am not sure you need it.

Collaborator (Author):
You are completely right, I misunderstood. Changed.

class DummyImplementationUDF(BaseModelUDF):
def __init__(self,
exa,
@@ -44,9 +76,9 @@ def create_dataframes_from_predictions(
results_df_list.append(result_df)
return results_df_list

def load_models(self, model_name: str, token_conn_name: str) -> None:
token = False
self.last_loaded_model = self.base_model.from_pretrained(
model_name, cache_dir=self.cache_dir, use_auth_token=token)
self.last_loaded_tokenizer = self.tokenizer.from_pretrained(
model_name, cache_dir=self.cache_dir, use_auth_token=token)
def create_model_loader(self):
""" overwrite the model loader creation with dummy model loader creation"""
self.model_loader = DummyModelLoader(self.base_model,
self.tokenizer,
self.task_name,
self.device)
5 changes: 3 additions & 2 deletions tests/unit_tests/udfs/test_base_udf.py
@@ -10,6 +10,7 @@
from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context
from tests.unit_tests.udfs.base_model_dummy_implementation import DummyImplementationUDF
from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer import ModelFactoryProtocol
from exasol_transformers_extension.utils.load_model import LoadModel
from tests.utils.mock_cast import mock_cast
import re

@@ -80,8 +81,8 @@ def setup_tests_and_run(bucketfs_conn_name, bucketfs_conn, sub_dir, model_name):
None)
mock_ctx = create_mock_udf_context(input_data, mock_meta)
udf = DummyImplementationUDF(exa=mock_exa,
base_model=mock_base_model_factory,
tokenizer=mock_tokenizer_factory)
base_model=mock_base_model_factory,
tokenizer=mock_tokenizer_factory)
udf.run(mock_ctx)
res = mock_ctx.output
return res, mock_meta