diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index 18b020e95d..c347141767 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -21,8 +21,11 @@
 from google.cloud.aiplatform.datasets import Dataset
 from google.cloud.aiplatform.models import Endpoint
 from google.cloud.aiplatform.models import Model
-from google.cloud.aiplatform.training_jobs import CustomTrainingJob
 from google.cloud.aiplatform.jobs import BatchPredictionJob
+from google.cloud.aiplatform.training_jobs import (
+    CustomTrainingJob,
+    AutoMLTablesTrainingJob,
+)
 
 """
 Usage:
@@ -36,6 +39,7 @@
     "gapic",
     "BatchPredictionJob",
     "CustomTrainingJob",
+    "AutoMLTablesTrainingJob",
     "Model",
     "Dataset",
     "Endpoint",
diff --git a/google/cloud/aiplatform/schema.py b/google/cloud/aiplatform/schema.py
index ae72256d6c..03a61cd0fa 100644
--- a/google/cloud/aiplatform/schema.py
+++ b/google/cloud/aiplatform/schema.py
@@ -21,6 +21,7 @@
 class training_job:
     class definition:
         custom_task = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml"
+        tabular_task = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_tabular_1.0.0.yaml"
 
 
 class dataset:
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
index 978509361b..0310abf1df 100644
--- a/google/cloud/aiplatform/training_jobs.py
+++ b/google/cloud/aiplatform/training_jobs.py
@@ -48,7 +48,6 @@
     training_pipeline as gca_training_pipeline,
 )
 
-
 from google.cloud import storage
 from google.protobuf import json_format
 from google.protobuf import struct_pb2
@@ -180,7 +179,7 @@ def _run_job(
                 than the one given on input. The output URI will
                 point to a location where the user only has a
                 read access.
-            training_task_inputs (~.struct.Value):
+            training_task_inputs (dict):
                 Required. The training task's parameter(s), as
                 specified in the
                 ``training_task_definition``'s
@@ -238,7 +237,9 @@ def _run_job(
         training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=self._display_name,
             training_task_definition=training_task_definition,
-            training_task_inputs=training_task_inputs,
+            training_task_inputs=json_format.ParseDict(
+                training_task_inputs, struct_pb2.Value()
+            ),
             model_to_upload=model,
             input_data_config=input_data_config,
        )
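
For context, a minimal standalone sketch of the conversion `_run_job` now performs: subclasses pass a plain dict, and the pipeline layer wraps it in a proto `Value`. The worker pool spec and bucket below are hypothetical placeholders, not values taken from this change.

    from google.protobuf import json_format
    from google.protobuf import struct_pb2

    # Plain dict, as now passed by run() implementations.
    training_task_inputs = {
        "workerPoolSpecs": [{"machineSpec": {"machineType": "n1-standard-4"}}],
        "baseOutputDirectory": {"output_uri_prefix": "gs://my-bucket/output"},
    }

    # ParseDict populates the given message from the dict and returns it.
    inputs_value = json_format.ParseDict(training_task_inputs, struct_pb2.Value())

    # The resulting Value round-trips back to the original dict.
    assert json_format.MessageToDict(inputs_value) == training_task_inputs
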
@@ -890,14 +891,6 @@ def __init__(
                 Required: Uri of the training container image in the GCR.
             requirements (Sequence[str]):
                 List of python packages dependencies of script.
-            project (str):
-                Optional project to retrieve model from. If not set, project set in
-                aiplatform.init will be used.
-            location (str):
-                Optional location to retrieve model from. If not set, location set in
-                aiplatform.init will be used.
-            credentials (auth_credentials.Credentials):
-                Optional credentials to use to retrieve the model.
             model_serving_container_image_uri (str):
                 If the training produces a managed AI Platform Model, the URI of the
                 Model serving container suitable for serving the model produced by the
@@ -1055,13 +1048,13 @@ def run(
                 The number of accelerators to attach to a worker replica.
             training_fraction_split (float):
                 The fraction of the input data that is to be
-                used to train the Model.
+                used to train the Model. This is ignored if Dataset is not provided.
             validation_fraction_split (float):
                 The fraction of the input data that is to be
-                used to validate the Model.
+                used to validate the Model. This is ignored if Dataset is not provided.
             test_fraction_split (float):
                 The fraction of the input data that is to be
-                used to evaluate the Model.
+                used to evaluate the Model. This is ignored if Dataset is not provided.
 
         Returns:
             model: The trained AI Platform Model resource or None if training did not
@@ -1129,13 +1122,10 @@ def run(
         if args:
             spec["pythonPackageSpec"]["args"] = args
 
-        training_task_inputs = json_format.ParseDict(
-            {
-                "workerPoolSpecs": worker_pool_specs,
-                "baseOutputDirectory": {"output_uri_prefix": base_output_dir},
-            },
-            struct_pb2.Value(),
-        )
+        training_task_inputs = {
+            "workerPoolSpecs": worker_pool_specs,
+            "baseOutputDirectory": {"output_uri_prefix": base_output_dir},
+        }
 
         training_task_definition = schema.training_job.definition.custom_task
 
@@ -1185,4 +1175,211 @@ def _model_upload_fail_string(self) -> str:
 
 
 class AutoMLTablesTrainingJob(_TrainingJob):
-    pass
+    def __init__(
+        self,
+        display_name: str,
+        optimization_prediction_type: str,
+        optimization_objective: Optional[str] = None,
+        column_transformations: Optional[Union[Dict, List[Dict]]] = None,
+        optimization_objective_recall_value: Optional[float] = None,
+        optimization_objective_precision_value: Optional[float] = None,
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+    ):
+        """Constructs an AutoML Tables Training Job.
+
+        Args:
+            display_name (str):
+                Required. The user-defined name of this TrainingPipeline.
+            optimization_prediction_type (str):
+                The type of prediction the Model is to produce.
+                "classification" - Predict one out of multiple target values that is
+                picked for each row.
+                "regression" - Predict a value based on its relation to other values.
+                This type is available only to columns that contain
+                semantically numeric values, i.e. integers or floating
+                point numbers, even if stored as e.g. strings.
+
+            optimization_objective (str):
+                Optional. Objective function the Model is to be optimized towards. The training
+                task creates a Model that maximizes/minimizes the value of the objective
+                function over the validation set.
+
+                The supported optimization objectives depend on the prediction type, and
+                in the case of classification also the number of distinct values in the
+                target column (two distinct values -> binary, 3 or more distinct values
+                -> multi class).
+                If the field is not set, the default objective function is used.
+
+                Classification (binary):
+                "maximize-au-roc" (default) - Maximize the area under the receiver
+                operating characteristic (ROC) curve.
+                "minimize-log-loss" - Minimize log loss.
+                "maximize-au-prc" - Maximize the area under the precision-recall curve.
+                "maximize-precision-at-recall" - Maximize precision for a specified
+                recall value.
+                "maximize-recall-at-precision" - Maximize recall for a specified
+                precision value.
+
+                Classification (multi class):
+                "minimize-log-loss" (default) - Minimize log loss.
+
+                Regression:
+                "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
+                "minimize-mae" - Minimize mean-absolute error (MAE).
+                "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
+            column_transformations (Optional[Union[Dict, List[Dict]]]):
+                Optional. Transformations to apply to the input columns (i.e. columns other
+                than the targetColumn). Each transformation may produce multiple
+                result values from the column's value, and all are used for training.
+                When creating a transformation for a BigQuery Struct column, the column
+                should be flattened using "." as the delimiter.
+                If an input column has no transformations on it, such a column is
+                ignored by the training, except for the targetColumn, which should have
+                no transformations defined on it.
+            optimization_objective_recall_value (float):
+                Optional. Required when maximize-precision-at-recall optimizationObjective was
+                picked, represents the recall value at which the optimization is done.
+
+                The minimum value is 0 and the maximum is 1.0.
+            optimization_objective_precision_value (float):
+                Optional. Required when maximize-recall-at-precision optimizationObjective was
+                picked, represents the precision value at which the optimization is
+                done.
+
+                The minimum value is 0 and the maximum is 1.0.
+            project (str):
+                Optional. Project to run training in. Overrides project set in aiplatform.init.
+            location (str):
+                Optional. Location to run training in. Overrides location set in aiplatform.init.
+            credentials (auth_credentials.Credentials):
+                Optional. Custom credentials to use to call the training service. Overrides
+                credentials set in aiplatform.init.
+        """
+        super().__init__(
+            display_name=display_name,
+            project=project,
+            location=location,
+            credentials=credentials,
+        )
+        self._column_transformations = column_transformations
+        self._optimization_objective = optimization_objective
+        self._optimization_prediction_type = optimization_prediction_type
+        self._optimization_objective_recall_value = optimization_objective_recall_value
+        self._optimization_objective_precision_value = (
+            optimization_objective_precision_value
+        )
+
+    def run(
+        self,
+        model_display_name: str,
+        dataset: datasets.Dataset,
+        target_column: str,
+        training_fraction_split: float = 0.8,
+        validation_fraction_split: float = 0.1,
+        test_fraction_split: float = 0.1,
+        weight_column: Optional[str] = None,
+        budget_milli_node_hours: int = 1000,
+        disable_early_stopping: bool = False,
+    ) -> models.Model:
+        """Runs the training job and returns a model.
+
+        Data fraction splits:
+            Any of ``training_fraction_split``, ``validation_fraction_split`` and
+            ``test_fraction_split`` may optionally be provided; they must sum to at
+            most 1. If the provided ones sum to less than 1, the remainder is assigned
+            to sets as decided by AI Platform. If none of the fractions are set, by
+            default roughly 80% of data will be used for training, 10% for validation,
+            and 10% for test.
+
+        Args:
+            model_display_name (str):
+                Required. The display name of the managed AI Platform Model produced by
+                this TrainingPipeline. The name can be up to 128 characters long and
+                can consist of any UTF-8 characters.
+            dataset (aiplatform.Dataset):
+                Required. The dataset within the same Project from which data will be used to train the Model. The
+                Dataset must use schema compatible with Model being trained,
+                and what is compatible should be described in the used
+                TrainingPipeline's [training_task_definition]
+                [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition].
+                For tabular Datasets, all their data is exported to
+                training, to pick and choose from.
+            target_column (str):
+                Required. The name of the column that the Model is to predict values for.
+            training_fraction_split (float):
+                Optional. The fraction of the input data that is to be
+                used to train the Model. This is ignored if Dataset is not provided.
+            validation_fraction_split (float):
+                Optional. The fraction of the input data that is to be
+                used to validate the Model. This is ignored if Dataset is not provided.
+            test_fraction_split (float):
+                Optional. The fraction of the input data that is to be
+                used to evaluate the Model. This is ignored if Dataset is not provided.
+            weight_column (str):
+                Optional. Name of the column that should be used as the weight column.
+                Higher values in this column give more importance to the row
+                during Model training. The column must have numeric values between 0 and
+                10000 inclusively, and 0 value means that the row is ignored.
+                If the weight column field is not set, then all rows are assumed to have
+                equal weight of 1.
+            budget_milli_node_hours (int):
+                Optional. The train budget of creating this Model, expressed in milli node
+                hours i.e. 1,000 value in this field means 1 node hour.
+                The training cost of the model will not exceed this budget. The final
+                cost will be attempted to be close to the budget, though may end up
+                being (even) noticeably smaller - at the backend's discretion. This
+                especially may happen when further model training ceases to provide
+                any improvements.
+                If the budget is set to a value known to be insufficient to train a
+                Model for the given training set, the training won't be attempted and
+                will error.
+                The minimum value is 1000 and the maximum is 72000.
+            disable_early_stopping (bool):
+                Optional. If true, the entire budget is used. This disables the early stopping
+                feature. By default, the early stopping feature is enabled, which means
+                that training might stop before the entire training budget has been
+                used, if further training no longer brings significant improvement
+                to the model.
+
+        Returns:
+            model: The trained AI Platform Model resource or None if training did not
+                produce an AI Platform Model.
+
+        Raises:
+            RuntimeError: If training job has already been run.
+        """
+
+        training_task_definition = schema.training_job.definition.tabular_task
+
+        training_task_inputs_dict = {
+            # required inputs
+            "targetColumn": target_column,
+            "transformations": self._column_transformations,
+            "trainBudgetMilliNodeHours": budget_milli_node_hours,
+            # optional inputs
+            "weightColumnName": weight_column,
+            "disableEarlyStopping": disable_early_stopping,
+            "optimizationObjective": self._optimization_objective,
+            "predictionType": self._optimization_prediction_type,
+            "optimizationObjectiveRecallValue": self._optimization_objective_recall_value,
+            "optimizationObjectivePrecisionValue": self._optimization_objective_precision_value,
+        }
+
+        model = gca_model.Model(display_name=model_display_name)
+
+        return self._run_job(
+            training_task_definition=training_task_definition,
+            training_task_inputs=training_task_inputs_dict,
+            dataset=dataset,
+            training_fraction_split=training_fraction_split,
+            validation_fraction_split=validation_fraction_split,
+            test_fraction_split=test_fraction_split,
+            model=model,
+        )
+
+    @property
+    def _model_upload_fail_string(self) -> str:
+        """Helper property for model upload failure."""
+        return (
+            f"Training Pipeline {self.resource_name} is not configured to upload a "
+            "Model."
+        )
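
To make the new surface concrete, a hedged usage sketch of the class added above. The project, bucket, dataset resource name, and column names are hypothetical placeholders, and the `Dataset` constructor call assumes it accepts an existing resource name:

    from google.cloud import aiplatform

    aiplatform.init(project="my-project", staging_bucket="gs://my-bucket")

    # Hypothetical tabular dataset that already exists in the project.
    dataset = aiplatform.Dataset("projects/my-project/locations/us-central1/datasets/123")

    job = aiplatform.AutoMLTablesTrainingJob(
        display_name="iris-tables-training",
        optimization_prediction_type="classification",
        optimization_objective="minimize-log-loss",
        column_transformations=[
            {"auto": {"column_name": "sepal_width"}},
            {"auto": {"column_name": "sepal_length"}},
        ],
    )

    # Blocks until the pipeline finishes; returns the managed Model (or None).
    model = job.run(
        dataset=dataset,
        target_column="species",
        model_display_name="iris-model",
        budget_milli_node_hours=1000,
    )
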
diff --git a/tests/unit/aiplatform/test_automl_tables_training_jobs.py b/tests/unit/aiplatform/test_automl_tables_training_jobs.py
new file mode 100644
index 0000000000..ac1250d6e5
--- /dev/null
+++ b/tests/unit/aiplatform/test_automl_tables_training_jobs.py
@@ -0,0 +1,383 @@
+import importlib
+import pytest
+from unittest import mock
+
+from google.cloud import aiplatform
+from google.cloud.aiplatform import datasets
+from google.cloud.aiplatform import initializer
+from google.cloud.aiplatform import schema
+from google.cloud.aiplatform.training_jobs import AutoMLTablesTrainingJob
+
+from google.cloud.aiplatform_v1beta1.services.model_service import (
+    client as model_service_client,
+)
+from google.cloud.aiplatform_v1beta1.services.pipeline_service import (
+    client as pipeline_service_client,
+)
+from google.cloud.aiplatform_v1beta1.types import model as gca_model
+from google.cloud.aiplatform_v1beta1.types import pipeline_state as gca_pipeline_state
+from google.cloud.aiplatform_v1beta1.types import (
+    training_pipeline as gca_training_pipeline,
+)
+
+from google.protobuf import json_format
+from google.protobuf import struct_pb2
+
+_TEST_BUCKET_NAME = "test-bucket"
+_TEST_GCS_PATH_WITHOUT_BUCKET = "path/to/folder"
+_TEST_GCS_PATH = f"{_TEST_BUCKET_NAME}/{_TEST_GCS_PATH_WITHOUT_BUCKET}"
+_TEST_GCS_PATH_WITH_TRAILING_SLASH = f"{_TEST_GCS_PATH}/"
+_TEST_PROJECT = "test-project"
+
+_TEST_DISPLAY_NAME = "test-display-name"
+_TEST_TRAINING_CONTAINER_IMAGE = "gcr.io/test-training/container:image"
+
+_TEST_TRAINING_COLUMN_TRANSFORMATIONS = [
+    {"auto": {"column_name": "sepal_width"}},
+    {"auto": {"column_name": "sepal_length"}},
+    {"auto": {"column_name": "petal_length"}},
+    {"auto": {"column_name": "petal_width"}},
+]
+_TEST_TRAINING_TARGET_COLUMN = "target"
+_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS = 1000
+_TEST_TRAINING_WEIGHT_COLUMN = "weight"
+_TEST_TRAINING_DISABLE_EARLY_STOPPING = True
+_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME = "minimize-log-loss"
+_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE = "classification"
+_TEST_TRAINING_TASK_INPUTS = json_format.ParseDict(
+    {
+        # required inputs
+        "targetColumn": _TEST_TRAINING_TARGET_COLUMN,
+        "transformations": _TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+        "trainBudgetMilliNodeHours": _TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+        # optional inputs
+        "weightColumnName": _TEST_TRAINING_WEIGHT_COLUMN,
+        "disableEarlyStopping": _TEST_TRAINING_DISABLE_EARLY_STOPPING,
+        "predictionType": _TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+        "optimizationObjective": _TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+        "optimizationObjectiveRecallValue": None,
+        "optimizationObjectivePrecisionValue": None,
+    },
+    struct_pb2.Value(),
+)
+
+_TEST_DATASET_NAME = "test-dataset-name"
+
+_TEST_MODEL_DISPLAY_NAME = "model-display-name"
+_TEST_TRAINING_FRACTION_SPLIT = 0.6
+_TEST_VALIDATION_FRACTION_SPLIT = 0.2
+_TEST_TEST_FRACTION_SPLIT = 0.2
+
+_TEST_OUTPUT_PYTHON_PACKAGE_PATH = "gs://test/output/python/trainer.tar.gz"
+
+_TEST_MODEL_NAME = "projects/my-project/locations/us-central1/models/12345"
+
+_TEST_PIPELINE_RESOURCE_NAME = (
+    "projects/my-project/locations/us-central1/trainingPipeline/12345"
+)
+
+
+class TestAutoMLTablesTrainingJob:
+    def setup_method(self):
+        importlib.reload(initializer)
+        importlib.reload(aiplatform)
+
+    @pytest.fixture
+    def mock_pipeline_service_create(self):
+        with mock.patch.object(
+            pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
+        ) as mock_create_training_pipeline:
+            mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+                model_to_upload=gca_model.Model(name=_TEST_MODEL_NAME),
+            )
+            yield mock_create_training_pipeline
+
+    @pytest.fixture
+    def mock_pipeline_service_create_with_no_model_to_upload(self):
+        with mock.patch.object(
+            pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
+        ) as mock_create_training_pipeline:
+            mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED,
+            )
+            yield mock_create_training_pipeline
+
+    @pytest.fixture
+    def mock_pipeline_service_create_and_get_with_fail(self):
+        with mock.patch.object(
+            pipeline_service_client.PipelineServiceClient, "create_training_pipeline"
+        ) as mock_create_training_pipeline:
+            mock_create_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+                name=_TEST_PIPELINE_RESOURCE_NAME,
+                state=gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING,
+            )
+
+            with mock.patch.object(
+                pipeline_service_client.PipelineServiceClient, "get_training_pipeline"
+            ) as mock_get_training_pipeline:
+                mock_get_training_pipeline.return_value = gca_training_pipeline.TrainingPipeline(
+                    name=_TEST_PIPELINE_RESOURCE_NAME,
+                    state=gca_pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
+                )
+
+                yield mock_create_training_pipeline, mock_get_training_pipeline
+
+    @pytest.fixture
+    def mock_model_service_get(self):
+        with mock.patch.object(
+            model_service_client.ModelServiceClient, "get_model"
+        ) as mock_get_model:
+            mock_get_model.return_value = gca_model.Model()
+            yield mock_get_model
+
+    @pytest.fixture
+    def mock_dataset(self):
+        ds = mock.MagicMock(datasets.Dataset)
+        ds.name = _TEST_DATASET_NAME
+        return ds
+
+    def test_run_call_pipeline_service_create(
+        self, mock_pipeline_service_create, mock_dataset, mock_model_service_get,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model_from_job = job.run(
+            dataset=mock_dataset,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
+            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
+        )
+
+        true_fraction_split = gca_training_pipeline.FractionSplit(
+            training_fraction=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME)
+
+        true_input_data_config = gca_training_pipeline.InputDataConfig(
+            fraction_split=true_fraction_split, dataset_id=mock_dataset.name,
+        )
+
+        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
+            display_name=_TEST_DISPLAY_NAME,
+            training_task_definition=schema.training_job.definition.tabular_task,
+            training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
+            model_to_upload=true_managed_model,
+            input_data_config=true_input_data_config,
+        )
+
+        mock_pipeline_service_create.assert_called_once_with(
+            parent=initializer.global_config.common_location_path(),
+            training_pipeline=true_training_pipeline,
+        )
+
+        assert job._gca_resource is mock_pipeline_service_create.return_value
+
+        mock_model_service_get.assert_called_once_with(name=_TEST_MODEL_NAME)
+
+        assert model_from_job._gca_resource is mock_model_service_get.return_value
+
+        assert job.get_model()._gca_resource is mock_model_service_get.return_value
+
+        assert not job.has_failed
+
+        assert job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
+
+    def test_run_called_twice_raises(
+        self, mock_pipeline_service_create, mock_dataset, mock_model_service_get,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        job.run(
+            dataset=mock_dataset,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        with pytest.raises(RuntimeError):
+            job.run(
+                dataset=mock_dataset,
+                target_column=_TEST_TRAINING_TARGET_COLUMN,
+                model_display_name=_TEST_MODEL_DISPLAY_NAME,
+                training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+                validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+                test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            )
+
+    def test_run_call_pipeline_service_create_with_no_dataset(
+        self, mock_pipeline_service_create, mock_model_service_get,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model_from_job = job.run(
+            dataset=None,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            weight_column=_TEST_TRAINING_WEIGHT_COLUMN,
+            budget_milli_node_hours=_TEST_TRAINING_BUDGET_MILLI_NODE_HOURS,
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            disable_early_stopping=_TEST_TRAINING_DISABLE_EARLY_STOPPING,
+        )
+
+        true_managed_model = gca_model.Model(display_name=_TEST_MODEL_DISPLAY_NAME)
+
+        true_training_pipeline = gca_training_pipeline.TrainingPipeline(
+            display_name=_TEST_DISPLAY_NAME,
+            training_task_definition=schema.training_job.definition.tabular_task,
+            training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
+            model_to_upload=true_managed_model,
+            input_data_config=None,
+        )
+
+        mock_pipeline_service_create.assert_called_once_with(
+            parent=initializer.global_config.common_location_path(),
+            training_pipeline=true_training_pipeline,
+        )
+
+        assert job._gca_resource is mock_pipeline_service_create.return_value
+
+        mock_model_service_get.assert_called_once_with(name=_TEST_MODEL_NAME)
+
+        assert model_from_job._gca_resource is mock_model_service_get.return_value
+
+    def test_run_returns_none_if_no_model_to_upload(
+        self, mock_pipeline_service_create_with_no_model_to_upload, mock_dataset,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        model = job.run(
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            dataset=mock_dataset,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        assert model is None
+
+    def test_get_model_raises_if_no_model_to_upload(
+        self, mock_pipeline_service_create_with_no_model_to_upload, mock_dataset,
+    ):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        job.run(
+            model_display_name=_TEST_MODEL_DISPLAY_NAME,
+            dataset=mock_dataset,
+            target_column=_TEST_TRAINING_TARGET_COLUMN,
+            training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+            validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+            test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+        )
+
+        with pytest.raises(RuntimeError):
+            job.get_model()
+
+    def test_run_raises_if_pipeline_fails(
+        self, mock_pipeline_service_create_and_get_with_fail, mock_dataset,
+    ):
+
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        with pytest.raises(RuntimeError):
+            job.run(
+                model_display_name=_TEST_MODEL_DISPLAY_NAME,
+                dataset=mock_dataset,
+                target_column=_TEST_TRAINING_TARGET_COLUMN,
+                training_fraction_split=_TEST_TRAINING_FRACTION_SPLIT,
+                validation_fraction_split=_TEST_VALIDATION_FRACTION_SPLIT,
+                test_fraction_split=_TEST_TEST_FRACTION_SPLIT,
+            )
+
+        with pytest.raises(RuntimeError):
+            job.get_model()
+
+    def test_raises_before_run_is_called(self, mock_pipeline_service_create):
+        aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
+
+        job = AutoMLTablesTrainingJob(
+            display_name=_TEST_DISPLAY_NAME,
+            optimization_prediction_type=_TEST_TRAINING_OPTIMIZATION_PREDICTION_TYPE,
+            optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
+            column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
+            optimization_objective_recall_value=None,
+            optimization_objective_precision_value=None,
+        )
+
+        with pytest.raises(RuntimeError):
+            job.get_model()
+
+        with pytest.raises(RuntimeError):
+            job.has_failed
+
+        with pytest.raises(RuntimeError):
+            job.state
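
One detail worth noting about `_TEST_TRAINING_TASK_INPUTS` above: the optional inputs are passed as `None` and kept in the expected proto. A small standalone check of the protobuf behavior this relies on (the snippet is illustrative, not part of the change):

    from google.protobuf import json_format
    from google.protobuf import struct_pb2

    # A dict value of None parses to a protobuf null_value rather than being
    # dropped, so the expected Value in the test matches what run() builds when
    # the optional recall/precision arguments are left unset.
    value = json_format.ParseDict(
        {"optimizationObjectiveRecallValue": None}, struct_pb2.Value()
    )
    field = value.struct_value.fields["optimizationObjectiveRecallValue"]
    assert field.WhichOneof("kind") == "null_value"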