Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Huggingface model transfer class with save_pretrained in download #158

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Changelog

* [0.7.0](changes_0.7.0.md)
* [0.6.0](changes_0.6.0.md)
* [0.5.0](changes_0.5.0.md)
* [0.4.0](changes_0.4.0.md)
Expand Down
14 changes: 10 additions & 4 deletions doc/changes/changes_0.7.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,22 @@ T.B.D

### Features

- #143: Added HuggingfaceTransfer class with save_pretrained for saving the model locally
- #152: Made the container uploading and language registration two separate actions

### Bug Fixes
MarleneKress79789 marked this conversation as resolved.
Show resolved Hide resolved

- n/a

MarleneKress79789 marked this conversation as resolved.
Show resolved Hide resolved
### Refactorings

- #144: Extracted base_model_udf.load_models into separate class



### Documentation

MarleneKress79789 marked this conversation as resolved.
Show resolved Hide resolved

- n/a

### Security
- #144: Updated Cryptography to version 41.0.7

- #144: Updated Cryptography to version 41.0.7

Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
import tempfile
from pathlib import Path
from typing import Protocol, runtime_checkable


from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation

from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploaderFactory
from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory


@runtime_checkable
class ModelFactoryProtocol(Protocol):
    """
    Structural type for objects that can load a model via ``from_pretrained``.
    """

    def from_pretrained(self, model_name: str, cache_dir: Path, use_auth_token: str):
        """
        Load the model identified by ``model_name``.

        :model_name: Name of the model to load
        :cache_dir: Directory handed to the implementation as its cache location
        :use_auth_token: Token handed to the implementation for authentication
        """
        ...


class HuggingFaceHubBucketFSModelTransfer:

def __init__(self,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from pathlib import Path

from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation

from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploaderFactory
from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory





class HuggingFaceHubBucketFSModelTransferSP:
    """
    Class for downloading a model using the Huggingface Transformers API, and loading it into the BucketFS
    using save_pretrained.

    The constructor creates and enters a temporary directory obtained from
    ``temporary_directory_factory``. Use the instance as a context manager to
    release that directory deterministically; ``__del__`` is only a fallback.

    :bucketfs_location: BucketFSLocation the model should be loaded to
    :model_name: Name of the model to be downloaded using Huggingface Transformers API
    :model_path: Path the model will be loaded into the BucketFS at
    :token: Huggingface token, only needed for private models
    :temporary_directory_factory: Optional. Default is TemporaryDirectoryFactory. Mainly change for testing.
    :bucketfs_model_uploader_factory: Optional. Default is BucketFSModelUploaderFactory. Mainly change for testing.
    """
    def __init__(self,
                 bucketfs_location: BucketFSLocation,
                 model_name: str,
                 model_path: Path,
                 token: str,
                 temporary_directory_factory: TemporaryDirectoryFactory = TemporaryDirectoryFactory(),
                 bucketfs_model_uploader_factory: BucketFSModelUploaderFactory = BucketFSModelUploaderFactory()):
        self._token = token
        self._model_name = model_name
        self._temporary_directory_factory = temporary_directory_factory
        self._bucketfs_model_uploader = bucketfs_model_uploader_factory.create(
            model_path=model_path,
            bucketfs_location=bucketfs_location)
        self._tmpdir = temporary_directory_factory.create()
        # The directory is entered here (not in __enter__) so the object is
        # usable without a ``with`` statement as well.
        self._tmpdir_name = Path(self._tmpdir.__enter__())

    def __enter__(self) -> "HuggingFaceHubBucketFSModelTransferSP":
        """Return this object; the temporary directory was already entered in the constructor."""
        return self

    def __del__(self):
        """Fallback cleanup of the temporary directory when the object is garbage collected."""
        # __del__ also runs for instances whose __init__ raised before
        # ``_tmpdir`` was assigned; without this guard that case would emit
        # an AttributeError on top of the original failure.
        tmpdir = getattr(self, "_tmpdir", None)
        if tmpdir is not None:
            tmpdir.cleanup()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the temporary directory, forwarding the exception details to it."""
        self._tmpdir.__exit__(exc_type, exc_val, exc_tb)

    @property
    def _pretrained_model_dir(self) -> Path:
        """Directory below the temporary directory that holds the save_pretrained output."""
        return self._tmpdir_name / "pretrained" / self._model_name

    def download_from_huggingface_hub(self, model_factory: ModelFactoryProtocol):
        """
        Download a model from HuggingFace Hub into a temporary directory and save it with save_pretrained
        in temporary directory / pretrained / model name.

        :model_factory: Object whose from_pretrained is used to load the model
        """
        model = model_factory.from_pretrained(
            self._model_name,
            cache_dir=self._tmpdir_name / "cache",
            use_auth_token=self._token)
        model.save_pretrained(self._pretrained_model_dir)

    def upload_to_bucketfs(self) -> Path:
        """
        Upload the downloaded models into the BucketFS.

        returns: Path of the uploaded model in the BucketFS
        """
        return self._bucketfs_model_uploader.upload_directory(self._pretrained_model_dir)


class HuggingFaceHubBucketFSModelTransferSPFactory:
    """
    Factory producing HuggingFaceHubBucketFSModelTransferSP objects.
    """
    def create(self,
               bucketfs_location: BucketFSLocation,
               model_name: str,
               model_path: Path,
               token: str) -> HuggingFaceHubBucketFSModelTransferSP:
        """
        Build a HuggingFaceHubBucketFSModelTransferSP from the given arguments.

        The temporary-directory and uploader factories are not passed on, so the
        created object uses its own defaults for them.

        :bucketfs_location: BucketFSLocation the model should be loaded to
        :model_name: Name of the model to be downloaded using Huggingface Transformers API
        :model_path: Path the model will be loaded into the BucketFS at
        :token: Huggingface token, only needed for private models

        returns: The created HuggingFaceHubBucketFSModelTransferSP object.
        """
        transfer = HuggingFaceHubBucketFSModelTransferSP(
            bucketfs_location=bucketfs_location,
            model_name=model_name,
            model_path=model_path,
            token=token,
        )
        return transfer
16 changes: 16 additions & 0 deletions exasol_transformers_extension/utils/model_factory_protocol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pathlib import Path
from typing import Protocol, Union, runtime_checkable

import transformers


@runtime_checkable
class ModelFactoryProtocol(Protocol):
    """
    Structural type used for type hints on model factory arguments.
    """

    def from_pretrained(self, model_name: str, cache_dir: Path, use_auth_token: str) -> transformers.PreTrainedModel:
        """
        Load the model identified by ``model_name``.

        :model_name: Name of the model to load
        :cache_dir: Directory handed to the implementation as its cache location
        :use_auth_token: Token handed to the implementation for authentication

        returns: The loaded model
        """
        ...

    def save_pretrained(self, save_directory: Union[str, Path]):
        """
        Save into ``save_directory``.

        :save_directory: Target directory, as a string or Path
        """
        ...
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "exasol-transformers-extension"
version = "0.5.0"
version = "0.7.0"
description = "An Exasol extension to use state-of-the-art pretrained machine learning models via the transformers api."

authors = [
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pytest
import tempfile
from pathlib import Path
from typing import Union
from unittest.mock import create_autospec, MagicMock

from transformers import AutoModel

from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploader, \
BucketFSModelUploaderFactory
from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import ModelFactoryProtocol, \
HuggingFaceHubBucketFSModelTransferSP
from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory
from tests.utils.mock_cast import mock_cast

from tests.utils.parameters import model_params


class TestSetup:
    """
    Helper bundling the objects needed by the tests in this module.

    Builds a HuggingFaceHubBucketFSModelTransferSP that uses a real
    TemporaryDirectoryFactory and a mocked BucketFSModelUploaderFactory.

    :bucketfs_location: Location object handed through to the transfer object
    """
    # Helper class, not a test case: without this pytest tries to collect it
    # (the name starts with "Test") and warns because it defines __init__.
    __test__ = False

    def __init__(self, bucketfs_location):
        self.bucketfs_location = bucketfs_location
        self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol)
        self.temporary_directory_factory = TemporaryDirectoryFactory()
        self.bucketfs_model_uploader_factory_mock: Union[BucketFSModelUploaderFactory, MagicMock] = \
            create_autospec(BucketFSModelUploaderFactory)
        self.bucketfs_model_uploader_mock: Union[BucketFSModelUploader, MagicMock] = \
            create_autospec(BucketFSModelUploader)
        # The factory mock hands out the uploader mock on its (single) create() call.
        mock_cast(self.bucketfs_model_uploader_factory_mock.create).side_effect = [self.bucketfs_model_uploader_mock]

        self.token = "token"
        self.model_name = model_params.tiny_model
        self.model_path = Path("test_model_path")
        self.downloader = HuggingFaceHubBucketFSModelTransferSP(
            bucketfs_location=self.bucketfs_location,
            model_path=self.model_path,
            model_name=self.model_name,
            token=self.token,
            temporary_directory_factory=self.temporary_directory_factory,
            bucketfs_model_uploader_factory=self.bucketfs_model_uploader_factory_mock
        )

    def reset_mocks(self):
        """Clear the recorded calls of every mock created by this helper."""
        self.model_factory_mock.reset_mock()
        self.bucketfs_model_uploader_mock.reset_mock()
        self.bucketfs_model_uploader_factory_mock.reset_mock()


def test_download_with_model(bucketfs_location):
    """
    Download a model with AutoModel and check that the directory written by
    save_pretrained can be loaded again with AutoModel.from_pretrained.
    """
    test_setup = TestSetup(bucketfs_location)
    base_model_factory: ModelFactoryProtocol = AutoModel
    # Use the downloader as a context manager so its temporary directory is
    # released even when the assertion below fails.
    with test_setup.downloader as downloader:
        downloader.download_from_huggingface_hub(model_factory=base_model_factory)
        saved_model_dir = downloader._tmpdir_name / "pretrained" / test_setup.model_name
        assert AutoModel.from_pretrained(saved_model_dir)


def test_download_with_duplicate_model(bucketfs_location):
    """
    Download the same model twice with one downloader and check that the
    directory written by save_pretrained can still be loaded afterwards.
    """
    test_setup = TestSetup(bucketfs_location)
    base_model_factory: ModelFactoryProtocol = AutoModel
    # Use the downloader as a context manager so its temporary directory is
    # released even when the assertion below fails.
    with test_setup.downloader as downloader:
        downloader.download_from_huggingface_hub(model_factory=base_model_factory)
        downloader.download_from_huggingface_hub(model_factory=base_model_factory)
        saved_model_dir = downloader._tmpdir_name / "pretrained" / test_setup.model_name
        assert AutoModel.from_pretrained(saved_model_dir)
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from pathlib import Path
from typing import Union
from unittest.mock import create_autospec, MagicMock, call

from exasol_bucketfs_utils_python.bucketfs_location import BucketFSLocation

from exasol_transformers_extension.utils.bucketfs_model_uploader import BucketFSModelUploader, \
BucketFSModelUploaderFactory
from exasol_transformers_extension.utils.huggingface_hub_bucketfs_model_transfer_sp import ModelFactoryProtocol, \
HuggingFaceHubBucketFSModelTransferSP
from exasol_transformers_extension.utils.temporary_directory_factory import TemporaryDirectoryFactory
from tests.utils.mock_cast import mock_cast

from tests.utils.parameters import model_params


class TestSetup:
    """
    Helper bundling the mocks needed by the tests in this module.

    Builds a HuggingFaceHubBucketFSModelTransferSP whose BucketFS location,
    temporary directory factory and uploader factory are all autospec mocks.
    """
    # Helper class, not a test case: without this pytest tries to collect it
    # (the name starts with "Test") and warns because it defines __init__.
    __test__ = False

    def __init__(self):
        self.bucketfs_location_mock: Union[BucketFSLocation, MagicMock] = create_autospec(BucketFSLocation)
        self.model_factory_mock: Union[ModelFactoryProtocol, MagicMock] = create_autospec(ModelFactoryProtocol)
        self.temporary_directory_factory_mock: Union[TemporaryDirectoryFactory, MagicMock] = \
            create_autospec(TemporaryDirectoryFactory)
        self.bucketfs_model_uploader_factory_mock: Union[BucketFSModelUploaderFactory, MagicMock] = \
            create_autospec(BucketFSModelUploaderFactory)
        self.bucketfs_model_uploader_mock: Union[BucketFSModelUploader, MagicMock] = \
            create_autospec(BucketFSModelUploader)
        # The factory mock hands out the uploader mock on its (single) create() call.
        mock_cast(self.bucketfs_model_uploader_factory_mock.create).side_effect = [self.bucketfs_model_uploader_mock]

        self.token = "token"
        self.model_name = model_params.tiny_model
        self.model_path = Path("test_model_path")
        self.downloader = HuggingFaceHubBucketFSModelTransferSP(
            bucketfs_location=self.bucketfs_location_mock,
            model_path=self.model_path,
            model_name=self.model_name,
            token=self.token,
            temporary_directory_factory=self.temporary_directory_factory_mock,
            bucketfs_model_uploader_factory=self.bucketfs_model_uploader_factory_mock
        )

    def reset_mocks(self):
        """Clear the recorded calls of every mock created by this helper."""
        self.bucketfs_location_mock.reset_mock()
        self.temporary_directory_factory_mock.reset_mock()
        self.model_factory_mock.reset_mock()
        self.bucketfs_model_uploader_mock.reset_mock()
        self.bucketfs_model_uploader_factory_mock.reset_mock()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.bucketfs_model_uploader_factory_mock is missing

self.bucketfs_model_uploader_factory_mock.reset_mock()


def test_init():
    """
    Constructing the transfer object creates and enters one temporary
    directory, requests one uploader from the factory, and touches neither
    the model factory nor the BucketFS location.
    """
    test_setup = TestSetup()
    # Separate asserts so a failure names the expectation that was violated.
    assert test_setup.temporary_directory_factory_mock.mock_calls == [
        call.create(),
        call.create().__enter__(),
        call.create().__enter__().__fspath__()]
    assert test_setup.model_factory_mock.mock_calls == []
    assert test_setup.bucketfs_location_mock.mock_calls == []
    # mock_calls is read from the ``create`` child mock itself, so the
    # expected entry is a plain ``call(...)`` rather than ``call.create(...)``.
    assert mock_cast(test_setup.bucketfs_model_uploader_factory_mock.create).mock_calls == [
        call(model_path=test_setup.model_path, bucketfs_location=test_setup.bucketfs_location_mock)]


def test_download_function_call():
    """
    download_from_huggingface_hub calls from_pretrained with the model name,
    the "cache" sub-directory and the token, then calls save_pretrained on the
    result with the "pretrained"/model-name sub-directory.
    """
    test_setup = TestSetup()
    test_setup.downloader.download_from_huggingface_hub(model_factory=test_setup.model_factory_mock)
    # Value the mocked temporary directory returned from __enter__, i.e. the
    # root below which the downloader places "cache" and "pretrained".
    tmpdir_root = Path(mock_cast(test_setup.temporary_directory_factory_mock.create().__enter__).return_value)
    model_save_path = tmpdir_root / "pretrained" / test_setup.model_name
    assert test_setup.model_factory_mock.mock_calls == [
        call.from_pretrained(test_setup.model_name, cache_dir=tmpdir_root / "cache",
                             use_auth_token=test_setup.token),
        call.from_pretrained().save_pretrained(model_save_path)]


def test_upload_function_call():
    """
    upload_to_bucketfs hands the "pretrained"/model-name sub-directory of the
    temporary directory to the uploader's upload_directory exactly once.
    """
    setup = TestSetup()
    downloader = setup.downloader
    downloader.download_from_huggingface_hub(model_factory=setup.model_factory_mock)
    # Drop the calls recorded during the download so only the upload is checked.
    setup.reset_mocks()
    entered_tmpdir = mock_cast(setup.temporary_directory_factory_mock.create().__enter__).return_value
    expected_upload_dir = Path(entered_tmpdir) / "pretrained" / setup.model_name
    downloader.upload_to_bucketfs()
    upload_directory_mock = mock_cast(setup.bucketfs_model_uploader_mock.upload_directory)
    assert upload_directory_mock.mock_calls == [call(expected_upload_dir)]
Loading