From 39998beef44483cff3a891669b704f792cc58b20 Mon Sep 17 00:00:00 2001 From: mibe Date: Thu, 17 Oct 2024 10:35:43 +0100 Subject: [PATCH] #264 addressed review comments and added User Guide updates [CodeBuild] --- doc/user_guide/user_guide.md | 143 ++++++------------ exasol_transformers_extension/deploy.py | 26 ++-- .../with_db/deployment/test_deploy_cli.py | 1 - 3 files changed, 64 insertions(+), 106 deletions(-) diff --git a/doc/user_guide/user_guide.md b/doc/user_guide/user_guide.md index 8bc67ea6..5bedab74 100644 --- a/doc/user_guide/user_guide.md +++ b/doc/user_guide/user_guide.md @@ -56,8 +56,12 @@ models on the Exasol Cluster. More information on The BucketFS can be found - The Exasol cluster must already be running with version 7.1 or later. - DB connection information and credentials are needed. -### BucketFS Connection -An Exasol connection object must be created with Exasol BucketFS connection information and credentials. +### BucketFS Connection +An Exasol connection object must be created with Exasol BucketFS connection information and credentials. +Normally, the connection object is created as part of the Transformers Extension deployment +(see the [Setup section](#deploy-the-extension-to-the-database) below). This section describes how this object +can be created manually. + The format of the connection object is as following: ```sql CREATE OR REPLACE CONNECTION @@ -111,9 +115,10 @@ For more information please check the [Create Connection in Exasol](https://docs ### Huggingface token A valid token is required to download private models from the Huggingface hub and run prediction on them. -To avoid exposing such sensitive information, you can use Exasol Connection -objects. As seen in the example below, a token can be specified in the -password part of the Exasol connection object: +This token is considered sensitive information, hence it should be stored in an Exasol Connection +object. 
The easiest way to do this is to provide the token as an option during the extension deployment +(see the [Setup section](#deploy-the-extension-to-the-database) below). +It can also be created manually by running the following SQL command. ```sql CREATE OR REPLACE CONNECTION TO '' @@ -164,73 +169,35 @@ poetry install poetry build ``` -### The Pre-built Language Container - -This extension requires the installation of a Language Container in the Exasol Database. -The Script Language Container is a way to install the required programming language and -necessary dependencies in the Exasol Database so the UDF scripts can be executed. - -The Language Container is downloaded and installed by executing the -deployment script below. Please make sure that the version of the Language Container matches the -installed version of the Transformers Extension Package. See [the latest release](https://github.com/exasol/transformers-extension/releases) on GitHub. - - ```buildoutcfg - python -m exasol_transformers_extension.deploy language-container - ``` +### Deploy the Extension to the Database +The Transformers Extension must be deployed to the database using the following command: +```shell +python -m exasol_transformers_extension.deploy +``` -Please refer to the [Language Container Deployment Guide] for details about this command. +### The Pre-built Language Container -### Scripts Deployment +The deployment includes the installation of the Script Language Container (SLC). The SLC is a way to install +the required programming language and necessary dependencies in the Exasol Database so that UDF scripts can be +executed. The version of the installed SLC must match the version of the Transformers Extension Package. +See [the latest release](https://github.com/exasol/transformers-extension/releases) on GitHub. 
-Next you need to deploy all necessary scripts to the specified `SCHEMA` in your Exasol DB with the same -`LANGUAGE_ALIAS` using the following Python CLI command: +### List of options - ```buildoutcfg - python -m exasol_transformers_extension.deploy scripts - ``` +For information about the available options common to all Exasol extensions please refer to the +[documentation](https://github.com/exasol/python-extension-common/blob/0.8.0/doc/user_guide/user-guide.md) in the Exasol Python Extension Common package. -The choice of options is primarily determined by the storage backend being used - On-Prem or SaaS. +In addition, this extension provides the following installation options: -### List of options +| Option name | Default | Comment | +|:--------------------|:-------:|:----------------------------------------------------------------------| +| [no-]deploy-slc | True | Install SLC as part of the deployment | +| [no-]deploy-scripts | True | Install scripts as part of the deployment | +| bucketfs-conn-name | | Name of the [BucketFS connection object](#bucketfs-connection) | +| token-conn-name | | Name of the [token connection object](#huggingface-token) if required | +| token | | The [Huggingface token](#huggingface-token) if required | -The table below lists all available options. It shows which ones are applicable for On-Prem and for SaaS backends. -Unless stated otherwise in the comments column, the option is required for either or both backends. - -Some of the values, like passwords, are considered confidential. For security reasons, it is recommended to store -those values in environment variables instead of providing them in the command line. The names of the environment -variables are given in the comments column, where applicable. Alternatively, it is possible to put just the name of -an option in the command line, without providing its value. In this case, the command will prompt to enter the value -interactively. 
For long values, such as the SaaS account id, it is more practical to copy/paste the value from -another source. - -| Option name | On-Prem | SaaS | Comment | -|:-----------------------------|:-------:|:----:|:------------------------------------------------| -| dsn | [x] | | i.e. | -| db-user | [x] | | | -| db-pass | [x] | | Env. [DB_PASSWORD] | -| saas-url | | [x] | Optional, Env. [SAAS_HOST] | -| saas-account-id | | [x] | Env. [SAAS_ACCOUNT_ID] | -| saas-database-id | | [x] | Optional, Env. [SAAS_DATABASE_ID] | -| saas-database-name | | [x] | Optional, provide if the database_id is unknown | -| saas-token | | [x] | Env. [SAAS_TOKEN] | -| language-alias | [x] | [x] | Optional | -| schema | [x] | [x] | DB schema to deploy the scripts in | -| ssl-cert-path | [x] | [x] | Optional | -| [no_]use-ssl-cert-validation | [x] | [x] | Optional boolean, defaults to True | -| ssl-client-cert-path | [x] | | Optional | -| ssl-client-private-key | [x] | | Optional | - -### TLS/SSL options - -The `--ssl-cert-path` is needed if the TLS/SSL certificate is not in the OS truststore. -Generally speaking, this certificate is a list of trusted CA. It is needed for the server's certificate -validation by the client. -The option `--use-ssl-cert-validation`is the default, it can be disabled with `--no-use-ssl-cert-validation`. -One needs to exercise caution when turning the certificate validation off as it potentially lowers the security of the -Database connection. -The "server" certificate described above shall not be confused with the client's own certificate. -In some cases, this certificate may be requested by a server. The client certificate may or may not include -the private key. In the latter case, the key may be provided as a separate file. +The connection objects will not be created if their names are not provided. ## Store Models in BucketFS Before you can use pre-trained models, the models must be stored in the @@ -290,36 +257,24 @@ severely. 
Available task_types are the same as the names of our available UDFs, `translation` and`zero_shot_classification`. ### 2. Model Uploader Script -You can invoke the python script as below which allows to download the transformer -models from The Hugging Face hub to the local filesystem, and then from there to the BucketFS. - - -#### List of options - -The table below lists all available options. It shows which ones are applicable for On-Prem and for SaaS backends. -Unless stated otherwise in the comments column, the option is required for either or both backends. - -| Option name | On-Prem | SaaS | Comment | -|:-----------------------------|:-------:|:----:|:------------------------------------------------| -| bucketfs-name | [x] | | | -| bucketfs-host | [x] | | | -| bucketfs-port | [x] | | | -| bucketfs-user | [x] | | | -| bucketfs-password | [x] | | Env. [BUCKETFS_PASSWORD] | -| bucketfs-use-https | [x] | | Optional boolean, defaults to True | -| bucket | [x] | | | -| saas-url | | [x] | | -| saas-account-id | | [x] | Env. [SAAS_ACCOUNT_ID] | -| saas-database-id | | [x] | Optional, Env. [SAAS_DATABASE_ID] | -| saas-database-name | | [x] | Optional, provide if the database_id is unknown | -| saas-token | | [x] | Env. [SAAS_TOKEN] | -| model-name | [x] | [x] | | -| path-in-bucket | [x] | [x] | Root location in the bucket for all models | -| sub-dir | [x] | [x] | Sub-directory where this model should be stored | -| task_type | [x] | [x] | Name of the task you want to use the model for | -| token | [x] | [x] | Huggingface token (needed for private models) | -| [no_]use-ssl-cert-validation | [x] | [x] | Optional boolean, defaults to True | +You can invoke the Python script below which downloads the transformer +models from The Hugging Face hub to the local filesystem, then to the BucketFS. 
+ +```shell +python -m exasol_transformers_extension.upload_model +``` + +For information about the available options common to all Exasol extensions please refer to the +[documentation](https://github.com/exasol/python-extension-common/blob/0.8.0/doc/user_guide/user-guide.md) in the Exasol Python Extension Common package. + +In addition, this command provides the following options: +| Option name | Comment | +|:---------------|:----------------------------------------------------------------| +| model-name | Name of the model, as it's seen in the Huggingface hub | +| task-type | See the explanations below | +| sub-dir | Sub-directory in the BucketFS where this model should be stored | +| token | The [Huggingface token](#huggingface-token) if required | "task_type" is a variable for the type of task you plan to use the model for. Some models can be used for multiple types of tasks, but transformers stores diff --git a/exasol_transformers_extension/deploy.py b/exasol_transformers_extension/deploy.py index 90c6ecff..e22ca7ec 100644 --- a/exasol_transformers_extension/deploy.py +++ b/exasol_transformers_extension/deploy.py @@ -1,7 +1,7 @@ import logging import click from exasol.python_extension_common.cli.std_options import ( - StdParams, StdTags, select_std_options, ParameterFormatters) + StdParams, StdTags, select_std_options, ParameterFormatters, make_option_secret) from exasol.python_extension_common.cli.language_container_deployer_cli import ( LanguageContainerDeployerCli) from exasol.python_extension_common.cli.bucketfs_conn_object_cli import BucketfsConnObjectCli @@ -36,20 +36,24 @@ def get_bool_opt_name(arg_name: str) -> str: return f'--{opt_name}/--no-{opt_name}' -opts = select_std_options([StdTags.DB, StdTags.BFS, StdTags.SLC], formatters=formatters) -opts.append(click.Option([get_bool_opt_name(DEPLOY_SLC_ARG)], type=bool, default=True)) -opts.append(click.Option([get_bool_opt_name(DEPLOY_SCRIPTS_ARG)], type=bool, default=True)) 
-opts.append(click.Option([get_opt_name(BUCKETFS_CONN_NAME_ARG)], type=str)) -opts.append(click.Option([get_opt_name(TOKEN_CONN_NAME_ARG)], type=str)) -opts.append(click.Option([get_opt_name(TOKEN_ARG)], type=str)) +opt_lang_alias = {'type': str, 'default': 'PYTHON3_TE'} +opt_token = {'type': str, 'help': 'Huggingface hub token'} +make_option_secret(opt_token) +opts = select_std_options([StdTags.DB, StdTags.BFS, StdTags.SLC], + formatters=formatters, override={StdParams.language_alias: opt_lang_alias}) +opts.append(click.Option([get_bool_opt_name(DEPLOY_SLC_ARG)], type=bool, default=True, + help='Deploy SLC')) +opts.append(click.Option([get_bool_opt_name(DEPLOY_SCRIPTS_ARG)], type=bool, default=True, + help='Deploy scripts')) +opts.append(click.Option([get_opt_name(BUCKETFS_CONN_NAME_ARG)], type=str, + help='Create BucketFS connection object with this name')) +opts.append(click.Option([get_opt_name(TOKEN_CONN_NAME_ARG)], type=str, + help='Create token connection object with this name')) +opts.append(click.Option([get_opt_name(TOKEN_ARG)], **opt_token)) def deploy(**kwargs): - # Make sure there is a valid language_alias - if not kwargs.get(StdParams.language_alias.name): - kwargs[StdParams.language_alias.name] = 'PYTHON3_TE' - # Deploy the SLC if kwargs[DEPLOY_SLC_ARG]: slc_deployer = LanguageContainerDeployerCli( diff --git a/tests/integration_tests/with_db/deployment/test_deploy_cli.py b/tests/integration_tests/with_db/deployment/test_deploy_cli.py index c41de79a..daff8431 100644 --- a/tests/integration_tests/with_db/deployment/test_deploy_cli.py +++ b/tests/integration_tests/with_db/deployment/test_deploy_cli.py @@ -16,7 +16,6 @@ LANGUAGE_ALIAS = 'TE_E2E_LANG_ALIAS' -@pytest.mark.skip('Need to sort out the model upload test first') def test_deploy_cli(pyexasol_connection, backend_aware_database_params, backend_aware_bucketfs_params,