From 643d335693ec57848949ee173401867a1188678b Mon Sep 17 00:00:00 2001 From: Michael Hu Date: Fri, 3 Jun 2022 15:28:20 -0400 Subject: [PATCH] feat: add seq2seq forecasting training job (#1196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-aiplatform/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes b/229909845 🦕 --- Adds a `SequenceToSequencePlusForecastingTrainingJob` to training jobs. This job has the exact same signature as `AutoMLForecastingTrainingJob`, but we are creating a separate job in case the two models diverge in the future. The logic for `AutoMLForecastingTrainingJob` has been moved to a new abstract base class `_ForecastingTrainingJob`. The only things that differ between the seq2seq and automl training jobs that extend it are the `model_type` and `training_task_definition`. --- google/cloud/aiplatform/__init__.py | 2 + google/cloud/aiplatform/schema.py | 1 + google/cloud/aiplatform/training_jobs.py | 3294 +++++++++-------- .../system/aiplatform/test_e2e_forecasting.py | 67 +- .../test_automl_forecasting_training_jobs.py | 145 +- 5 files changed, 1890 insertions(+), 1619 deletions(-) diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index db7d0a7c18..1ad69b2a54 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -63,6 +63,7 @@ CustomPythonPackageTrainingJob, AutoMLTabularTrainingJob, AutoMLForecastingTrainingJob, + SequenceToSequencePlusForecastingTrainingJob, AutoMLImageTrainingJob, AutoMLTextTrainingJob, AutoMLVideoTrainingJob, @@ -116,6 +117,7 @@ "Model", "ModelEvaluation", "PipelineJob", + "SequenceToSequencePlusForecastingTrainingJob", "TabularDataset", "Tensorboard", "TensorboardExperiment", diff --git a/google/cloud/aiplatform/schema.py b/google/cloud/aiplatform/schema.py index a1da75d9e6..96a7a50bbd 100644 --- a/google/cloud/aiplatform/schema.py +++ b/google/cloud/aiplatform/schema.py @@ -23,6 +23,7 @@ class definition: custom_task = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" automl_tabular = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_tabular_1.0.0.yaml" automl_forecasting = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_time_series_forecasting_1.0.0.yaml" + seq2seq_plus_forecasting = "gs://google-cloud-aiplatform/schema/trainingjob/definition/seq2seq_plus_time_series_forecasting_1.0.0.yaml" automl_image_classification = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_image_classification_1.0.0.yaml" automl_image_object_detection = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_image_object_detection_1.0.0.yaml" automl_text_classification = "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_text_classification_1.0.0.yaml" diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index a6244e08ca..2b246e113a 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ 
-1561,176 +1561,70 @@ def _model_upload_fail_string(self) -> str: ) -# TODO(b/172368325) add scheduling, custom_job.Scheduling -class CustomTrainingJob(_CustomTrainingJob): - """Class to launch a Custom Training Job in Vertex AI using a script. +class _ForecastingTrainingJob(_TrainingJob): + """ABC for Forecasting Training Pipelines.""" - Takes a training implementation as a python script and executes that - script in Cloud Vertex AI Training. - """ + _supported_training_schemas = tuple() def __init__( self, - # TODO(b/223262536): Make display_name parameter fully optional in next major release - display_name: str, - script_path: str, - container_uri: str, - requirements: Optional[Sequence[str]] = None, - model_serving_container_image_uri: Optional[str] = None, - model_serving_container_predict_route: Optional[str] = None, - model_serving_container_health_route: Optional[str] = None, - model_serving_container_command: Optional[Sequence[str]] = None, - model_serving_container_args: Optional[Sequence[str]] = None, - model_serving_container_environment_variables: Optional[Dict[str, str]] = None, - model_serving_container_ports: Optional[Sequence[int]] = None, - model_description: Optional[str] = None, - model_instance_schema_uri: Optional[str] = None, - model_parameters_schema_uri: Optional[str] = None, - model_prediction_schema_uri: Optional[str] = None, + display_name: Optional[str] = None, + optimization_objective: Optional[str] = None, + column_specs: Optional[Dict[str, str]] = None, + column_transformations: Optional[List[Dict[str, Dict[str, str]]]] = None, project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, labels: Optional[Dict[str, str]] = None, training_encryption_spec_key_name: Optional[str] = None, model_encryption_spec_key_name: Optional[str] = None, - staging_bucket: Optional[str] = None, ): - """Constructs a Custom Training Job from a Python script. - - job = aiplatform.CustomTrainingJob( - display_name='test-train', - script_path='test_script.py', - requirements=['pandas', 'numpy'], - container_uri='gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest', - model_serving_container_image_uri='gcr.io/my-trainer/serving:1', - model_serving_container_predict_route='predict', - model_serving_container_health_route='metadata, - labels={'key': 'value'}, - ) - - Usage with Dataset: - - ds = aiplatform.TabularDataset( - 'projects/my-project/locations/us-central1/datasets/12345') - - job.run( - ds, - replica_count=1, - model_display_name='my-trained-model', - model_labels={'key': 'value'}, - ) - - Usage without Dataset: - - job.run(replica_count=1, model_display_name='my-trained-model) - - - TODO(b/169782082) add documentation about traning utilities - To ensure your model gets saved in Vertex AI, write your saved model to - os.environ["AIP_MODEL_DIR"] in your provided training script. - + """Constructs a Forecasting Training Job. Args: display_name (str): - Required. The user-defined name of this TrainingPipeline. - script_path (str): Required. Local path to training script. - container_uri (str): - Required: Uri of the training container image in the GCR. - requirements (Sequence[str]): - List of python packages dependencies of script. - model_serving_container_image_uri (str): - If the training produces a managed Vertex AI Model, the URI of the - Model serving container suitable for serving the model produced by the - training script. 
- model_serving_container_predict_route (str): - If the training produces a managed Vertex AI Model, An HTTP path to - send prediction requests to the container, and which must be supported - by it. If not specified a default HTTP path will be used by Vertex AI. - model_serving_container_health_route (str): - If the training produces a managed Vertex AI Model, an HTTP path to - send health check requests to the container, and which must be supported - by it. If not specified a standard HTTP path will be used by AI - Platform. - model_serving_container_command (Sequence[str]): - The command with which the container is run. Not executed within a - shell. The Docker image's ENTRYPOINT is used if this is not provided. - Variable references $(VAR_NAME) are expanded using the container's - environment. If a variable cannot be resolved, the reference in the - input string will be unchanged. The $(VAR_NAME) syntax can be escaped - with a double $$, ie: $$(VAR_NAME). Escaped references will never be - expanded, regardless of whether the variable exists or not. - model_serving_container_args (Sequence[str]): - The arguments to the command. The Docker image's CMD is used if this is - not provided. Variable references $(VAR_NAME) are expanded using the - container's environment. If a variable cannot be resolved, the reference - in the input string will be unchanged. The $(VAR_NAME) syntax can be - escaped with a double $$, ie: $$(VAR_NAME). Escaped references will - never be expanded, regardless of whether the variable exists or not. - model_serving_container_environment_variables (Dict[str, str]): - The environment variables that are to be present in the container. - Should be a dictionary where keys are environment variable names - and values are environment variable values for those names. - model_serving_container_ports (Sequence[int]): - Declaration of ports that are exposed by the container. This field is - primarily informational, it gives Vertex AI information about the - network connections the container uses. Listing or not a port here has - no impact on whether the port is actually exposed, any port listening on - the default "0.0.0.0" address inside a container will be accessible from - the network. - model_description (str): - The description of the Model. - model_instance_schema_uri (str): - Optional. Points to a YAML file stored on Google Cloud - Storage describing the format of a single instance, which - are used in - ``PredictRequest.instances``, - ``ExplainRequest.instances`` - and - ``BatchPredictionJob.input_config``. - The schema is defined as an OpenAPI 3.0.2 `Schema - Object `__. - AutoML Models always have this field populated by AI - Platform. Note: The URI given on output will be immutable - and probably different, including the URI scheme, than the - one given on input. The output URI will point to a location - where the user only has a read access. - model_parameters_schema_uri (str): - Optional. Points to a YAML file stored on Google Cloud - Storage describing the parameters of prediction and - explanation via - ``PredictRequest.parameters``, - ``ExplainRequest.parameters`` - and - ``BatchPredictionJob.model_parameters``. - The schema is defined as an OpenAPI 3.0.2 `Schema - Object `__. - AutoML Models always have this field populated by AI - Platform, if no parameters are supported it is set to an - empty string. Note: The URI given on output will be - immutable and probably different, including the URI scheme, - than the one given on input. 
The output URI will point to a - location where the user only has a read access. - model_prediction_schema_uri (str): - Optional. Points to a YAML file stored on Google Cloud - Storage describing the format of a single prediction - produced by this Model, which are returned via - ``PredictResponse.predictions``, - ``ExplainResponse.explanations``, - and - ``BatchPredictionJob.output_config``. - The schema is defined as an OpenAPI 3.0.2 `Schema - Object `__. - AutoML Models always have this field populated by AI - Platform. Note: The URI given on output will be immutable - and probably different, including the URI scheme, than the - one given on input. The output URI will point to a location - where the user only has a read access. + Optional. The user-defined name of this TrainingPipeline. + optimization_objective (str): + Optional. Objective function the model is to be optimized towards. + The training process creates a Model that optimizes the value of the objective + function over the validation set. The supported optimization objectives: + "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE). + "minimize-mae" - Minimize mean-absolute error (MAE). + "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). + "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE). + "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE) + and mean-absolute-error (MAE). + "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles. + (Set this objective to build quantile forecasts.) + column_specs (Dict[str, str]): + Optional. Alternative to column_transformations where the keys of the dict + are column names and their respective values are one of + AutoMLTabularTrainingJob.column_data_types. + When creating transformation for BigQuery Struct column, the column + should be flattened using "." as the delimiter. Only columns with no child + should have a transformation. + If an input column has no transformations on it, such a column is + ignored by the training, except for the targetColumn, which should have + no transformations defined on. + Only one of column_transformations or column_specs should be passed. + column_transformations (List[Dict[str, Dict[str, str]]]): + Optional. Transformations to apply to the input columns (i.e. columns other + than the targetColumn). Each transformation may produce multiple + result values from the column's value, and all are used for training. + When creating transformation for BigQuery Struct column, the column + should be flattened using "." as the delimiter. Only columns with no child + should have a transformation. + If an input column has no transformations on it, such a column is + ignored by the training, except for the targetColumn, which should have + no transformations defined on. + Only one of column_transformations or column_specs should be passed. + Consider using column_specs as column_transformations will be deprecated eventually. project (str): - Project to run training in. Overrides project set in aiplatform.init. + Optional. Project to run training in. Overrides project set in aiplatform.init. location (str): - Location to run training in. Overrides location set in aiplatform.init. + Optional. Location to run training in. Overrides location set in aiplatform.init. credentials (auth_credentials.Credentials): - Custom credentials to use to run call training service. Overrides + Optional. Custom credentials to use to run call training service. 
Overrides credentials set in aiplatform.init. labels (Dict[str, str]): Optional. The labels with user-defined metadata to @@ -1749,12 +1643,9 @@ def __init__( ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``. The key needs to be in the same region as where the compute resource is created. - If set, this TrainingPipeline will be secured by this key. - Note: Model trained by this TrainingPipeline is also secured by this key if ``model_to_upload`` is not set separately. - Overrides encryption_spec_key_name set in aiplatform.init. model_encryption_spec_key_name (Optional[str]): Optional. The Cloud KMS resource identifier of the customer @@ -1763,16 +1654,11 @@ def __init__( ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``. The key needs to be in the same region as where the compute resource is created. - If set, the trained Model will be secured by this key. - Overrides encryption_spec_key_name set in aiplatform.init. - staging_bucket (str): - Bucket used to stage source and training artifacts. Overrides - staging_bucket set in aiplatform.init. + Raises: + ValueError: If both column_transformations and column_specs were provided. """ - if not display_name: - display_name = self.__class__._generate_display_name() super().__init__( display_name=display_name, project=project, @@ -1781,74 +1667,76 @@ def __init__( labels=labels, training_encryption_spec_key_name=training_encryption_spec_key_name, model_encryption_spec_key_name=model_encryption_spec_key_name, - container_uri=container_uri, - model_instance_schema_uri=model_instance_schema_uri, - model_parameters_schema_uri=model_parameters_schema_uri, - model_prediction_schema_uri=model_prediction_schema_uri, - model_serving_container_environment_variables=model_serving_container_environment_variables, - model_serving_container_ports=model_serving_container_ports, - model_serving_container_image_uri=model_serving_container_image_uri, - model_serving_container_command=model_serving_container_command, - model_serving_container_args=model_serving_container_args, - model_serving_container_predict_route=model_serving_container_predict_route, - model_serving_container_health_route=model_serving_container_health_route, - model_description=model_description, - staging_bucket=staging_bucket, ) - self._requirements = requirements - self._script_path = script_path + self._column_transformations = ( + column_transformations_utils.validate_and_get_column_transformations( + column_specs, + column_transformations, + ) + ) + + self._optimization_objective = optimization_objective + self._additional_experiments = [] + + @property + @classmethod + @abc.abstractmethod + def _model_type(cls) -> str: + """The type of forecasting model.""" + pass + + @property + @classmethod + @abc.abstractmethod + def _training_task_definition(cls) -> str: + """A GCS path to the YAML file that defines the training task. + + The definition files that can be used here are found in + gs://google-cloud-aiplatform/schema/trainingjob/definition/. 
+ """ + pass def run( self, - dataset: Optional[ - Union[ - datasets.ImageDataset, - datasets.TabularDataset, - datasets.TextDataset, - datasets.VideoDataset, - ] - ] = None, - annotation_schema_uri: Optional[str] = None, - model_display_name: Optional[str] = None, - model_labels: Optional[Dict[str, str]] = None, - base_output_dir: Optional[str] = None, - service_account: Optional[str] = None, - network: Optional[str] = None, - bigquery_destination: Optional[str] = None, - args: Optional[List[Union[str, float, int]]] = None, - environment_variables: Optional[Dict[str, str]] = None, - replica_count: int = 1, - machine_type: str = "n1-standard-4", - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", - accelerator_count: int = 0, - boot_disk_type: str = "pd-ssd", - boot_disk_size_gb: int = 100, - reduction_server_replica_count: int = 0, - reduction_server_machine_type: Optional[str] = None, - reduction_server_container_uri: Optional[str] = None, + dataset: datasets.TimeSeriesDataset, + target_column: str, + time_column: str, + time_series_identifier_column: str, + unavailable_at_forecast_columns: List[str], + available_at_forecast_columns: List[str], + forecast_horizon: int, + data_granularity_unit: str, + data_granularity_count: int, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, - training_filter_split: Optional[str] = None, - validation_filter_split: Optional[str] = None, - test_filter_split: Optional[str] = None, predefined_split_column_name: Optional[str] = None, timestamp_split_column_name: Optional[str] = None, - timeout: Optional[int] = None, - restart_job_on_worker_restart: bool = False, - enable_web_access: bool = False, - tensorboard: Optional[str] = None, - sync=True, + weight_column: Optional[str] = None, + time_series_attribute_columns: Optional[List[str]] = None, + context_window: Optional[int] = None, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, + quantiles: Optional[List[float]] = None, + validation_options: Optional[str] = None, + budget_milli_node_hours: int = 1000, + model_display_name: Optional[str] = None, + model_labels: Optional[Dict[str, str]] = None, + additional_experiments: Optional[List[str]] = None, + hierarchy_group_columns: Optional[List[str]] = None, + hierarchy_group_total_weight: Optional[float] = None, + hierarchy_temporal_total_weight: Optional[float] = None, + hierarchy_group_temporal_total_weight: Optional[float] = None, + window_column: Optional[str] = None, + window_stride_length: Optional[int] = None, + window_max_count: Optional[int] = None, + holiday_regions: Optional[List[str]] = None, + sync: bool = True, create_request_timeout: Optional[float] = None, - ) -> Optional[models.Model]: - """Runs the custom training job. - - Distributed Training Support: - If replica count = 1 then one chief replica will be provisioned. If - replica_count > 1 the remainder will be provisioned as a worker replica pool. - ie: replica_count = 10 will result in 1 chief and 9 workers - All replicas have same machine_type, accelerator_type, and accelerator_count + ) -> models.Model: + """Runs the training job and returns a model. If training on a Vertex AI dataset, you can use one of the following split configurations: Data fraction splits: @@ -1858,17 +1746,6 @@ def run( decided by Vertex AI. 
If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. - Data filter splits: - Assigns input data to training, validation, and test sets - based on the given filters, data pieces not matched by any - filter are ignored. Currently only supported for Datasets - containing DataItems. - If any of the filters in this message are to match nothing, then - they can be set as '-' (the minus sign). - If using filter splits, all of ``training_filter_split``, ``validation_filter_split`` and - ``test_filter_split`` must be provided. - Supported only for unstructured Datasets. - Predefined splits: Assigns input data to training, validation, and test sets based on the value of a provided key. If using predefined splits, ``predefined_split_column_name`` must be provided. @@ -1882,48 +1759,125 @@ def run( Supported only for tabular Datasets. Args: - dataset ( - Union[ - datasets.ImageDataset, - datasets.TabularDataset, - datasets.TextDataset, - datasets.VideoDataset, - ] - ): - Vertex AI to fit this training against. Custom training script should - retrieve datasets through passed in environment variables uris: + dataset (datasets.TimeSeriesDataset): + Required. The dataset within the same Project from which data will be used to train the Model. The + Dataset must use schema compatible with Model being trained, + and what is compatible should be described in the used + TrainingPipeline's [training_task_definition] + [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. + For time series Datasets, all their data is exported to + training, to pick and choose from. + target_column (str): + Required. Name of the column that the Model is to predict values for. This + column must be unavailable at forecast. + time_column (str): + Required. Name of the column that identifies time order in the time series. + This column must be available at forecast. + time_series_identifier_column (str): + Required. Name of the column that identifies the time series. + unavailable_at_forecast_columns (List[str]): + Required. Column names of columns that are unavailable at forecast. + Each column contains information for the given entity (identified by the + [time_series_identifier_column]) that is unknown before the forecast + (e.g. population of a city in a given year, or weather on a given day). + available_at_forecast_columns (List[str]): + Required. Column names of columns that are available at forecast. + Each column contains information for the given entity (identified by the + [time_series_identifier_column]) that is known at forecast. + forecast_horizon: (int): + Required. The amount of time into the future for which forecasted values for the target are + returned. Expressed in number of units defined by the [data_granularity_unit] and + [data_granularity_count] field. Inclusive. + data_granularity_unit (str): + Required. The data granularity unit. Accepted values are ``minute``, + ``hour``, ``day``, ``week``, ``month``, ``year``. + data_granularity_count (int): + Required. The number of data granularity units between data points in the training + data. If [data_granularity_unit] is `minute`, can be 1, 5, 10, 15, or 30. For all other + values of [data_granularity_unit], must be 1. + predefined_split_column_name (str): + Optional. The key is a name of one of the Dataset's data + columns. 
The value of the key (either the label's value or + value in the column) must be one of {``TRAIN``, + ``VALIDATE``, ``TEST``}, and it defines to which set the + given piece of data is assigned. If for a piece of data the + key is not present or has an invalid value, that piece is + ignored by the pipeline. - os.environ["AIP_TRAINING_DATA_URI"] - os.environ["AIP_VALIDATION_DATA_URI"] - os.environ["AIP_TEST_DATA_URI"] + Supported only for tabular and time series Datasets. + timestamp_split_column_name (str): + Optional. The key is a name of one of the Dataset's data + columns. The value of the key values of the key (the values in + the column) must be in RFC 3339 `date-time` format, where + `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a + piece of data the key is not present or has an invalid value, + that piece is ignored by the pipeline. + Supported only for tabular and time series Datasets. + This parameter must be used with training_fraction_split, + validation_fraction_split, and test_fraction_split. + weight_column (str): + Optional. Name of the column that should be used as the weight column. + Higher values in this column give more importance to the row + during Model training. The column must have numeric values between 0 and + 10000 inclusively, and 0 value means that the row is ignored. + If the weight column field is not set, then all rows are assumed to have + equal weight of 1. This column must be available at forecast. + time_series_attribute_columns (List[str]): + Optional. Column names that should be used as attribute columns. + Each column is constant within a time series. + context_window (int): + Optional. The amount of time into the past training and prediction data is used for + model training and prediction respectively. Expressed in number of units defined by the + [data_granularity_unit] and [data_granularity_count] fields. When not provided uses the + default value of 0 which means the model sets each series context window to be 0 (also + known as "cold start"). Inclusive. + export_evaluated_data_items (bool): + Whether to export the test set predictions to a BigQuery table. + If False, then the export is not performed. + export_evaluated_data_items_bigquery_destination_uri (string): + Optional. URI of desired destination BigQuery table for exported test set predictions. - Additionally the dataset format is passed in as: + Expected format: + ``bq://::`` - os.environ["AIP_DATA_FORMAT"] - annotation_schema_uri (str): - Google Cloud Storage URI points to a YAML file describing - annotation schema. The schema is defined as an OpenAPI 3.0.2 - [Schema Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schema-object) The schema files - that can be used here are found in - gs://google-cloud-aiplatform/schema/dataset/annotation/, - note that the chosen schema must be consistent with - ``metadata`` - of the Dataset specified by - ``dataset_id``. + If not specified, then results are exported to the following auto-created BigQuery + table: + ``:export_evaluated_examples__.evaluated_examples`` - Only Annotations that both match this schema and belong to - DataItems not ignored by the split method are used in - respectively training, validation or test role, depending on - the role of the DataItem they are on. + Applies only if [export_evaluated_data_items] is True. 
+ export_evaluated_data_items_override_destination (bool): + Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], + if the table exists, for exported test set predictions. If False, and the + table exists, then the training job will fail. - When used in conjunction with - ``annotations_filter``, - the Annotations used for training are filtered by both - ``annotations_filter`` - and - ``annotation_schema_uri``. + Applies only if [export_evaluated_data_items] is True and + [export_evaluated_data_items_bigquery_destination_uri] is specified. + quantiles (List[float]): + Quantiles to use for the `minimize-quantile-loss` + [AutoMLForecastingTrainingJob.optimization_objective]. This argument is required in + this case. + + Accepts up to 5 quantiles in the form of a double from 0 to 1, exclusive. + Each quantile must be unique. + validation_options (str): + Validation options for the data validation component. The available options are: + "fail-pipeline" - (default), will validate against the validation and fail the pipeline + if it fails. + "ignore-validation" - ignore the results of the validation and continue the pipeline + budget_milli_node_hours (int): + Optional. The train budget of creating this Model, expressed in milli node + hours i.e. 1,000 value in this field means 1 node hour. + The training cost of the model will not exceed this budget. The final + cost will be attempted to be close to the budget, though may end up + being (even) noticeably smaller - at the backend's discretion. This + especially may happen when further model training ceases to provide + any improvements. + If the budget is set to a value known to be insufficient to train a + Model for the given training set, the training won't be attempted and + will error. + The minimum value is 1000 and the maximum is 72000. model_display_name (str): - If the script produces a managed Vertex AI Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -1938,345 +1892,229 @@ def run( are allowed. See https://goo.gl/xmQnxf for more information and examples of labels. - base_output_dir (str): - GCS output directory of job. If not provided a - timestamped directory in the staging directory will be used. - - Vertex AI sets the following environment variables when it runs your training code: - - - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ - - AIP_TENSORBOARD_LOG_DIR: a Cloud Storage URI of a directory intended for saving TensorBoard logs, i.e. /logs/ - - service_account (str): - Specifies the service account for workload run-as account. - Users submitting jobs must have act-as permission on this run-as account. - network (str): - The full name of the Compute Engine network to which the job - should be peered. For example, projects/12345/global/networks/myVPC. - Private services access must already be configured for the network. - If left unspecified, the job is not peered with any network. - bigquery_destination (str): - Provide this field if `dataset` is a BiqQuery dataset. - The BigQuery project location where the training data is to - be written to. In the given project a new dataset is created - with name - ``dataset___`` - where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. 
All - training input data will be written into that dataset. In - the dataset three tables will be created, ``training``, - ``validation`` and ``test``. - - - AIP_DATA_FORMAT = "bigquery". - - AIP_TRAINING_DATA_URI ="bigquery_destination.dataset_*.training" - - AIP_VALIDATION_DATA_URI = "bigquery_destination.dataset_*.validation" - - AIP_TEST_DATA_URI = "bigquery_destination.dataset_*.test" - args (List[Unions[str, int, float]]): - Command line arguments to be passed to the Python script. - environment_variables (Dict[str, str]): - Environment variables to be passed to the container. - Should be a dictionary where keys are environment variable names - and values are environment variable values for those names. - At most 10 environment variables can be specified. - The Name of the environment variable must be unique. - - environment_variables = { - 'MY_KEY': 'MY_VALUE' - } - replica_count (int): - The number of worker replicas. If replica count = 1 then one chief - replica will be provisioned. If replica_count > 1 the remainder will be - provisioned as a worker replica pool. - machine_type (str): - The type of machine to use for training. - accelerator_type (str): - Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED, - NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, - NVIDIA_TESLA_T4 - accelerator_count (int): - The number of accelerators to attach to a worker replica. - boot_disk_type (str): - Type of the boot disk, default is `pd-ssd`. - Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or - `pd-standard` (Persistent Disk Hard Disk Drive). - boot_disk_size_gb (int): - Size in GB of the boot disk, default is 100GB. - boot disk size must be within the range of [100, 64000]. - reduction_server_replica_count (int): - The number of reduction server replicas, default is 0. - reduction_server_machine_type (str): - Optional. The type of machine to use for reduction server. - reduction_server_container_uri (str): - Optional. The Uri of the reduction server container image. - See details: https://cloud.google.com/vertex-ai/docs/training/distributed-training#reduce_training_time_with_reduction_server - training_fraction_split (float): - Optional. The fraction of the input data that is to be used to train - the Model. This is ignored if Dataset is not provided. - validation_fraction_split (float): - Optional. The fraction of the input data that is to be used to validate - the Model. This is ignored if Dataset is not provided. - test_fraction_split (float): - Optional. The fraction of the input data that is to be used to evaluate - the Model. This is ignored if Dataset is not provided. - training_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to train the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. This is ignored if Dataset is not provided. - validation_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to validate the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. 
This is ignored if Dataset is not provided. - test_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to test the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. This is ignored if Dataset is not provided. - predefined_split_column_name (str): - Optional. The key is a name of one of the Dataset's data - columns. The value of the key (either the label's value or - value in the column) must be one of {``training``, - ``validation``, ``test``}, and it defines to which set the - given piece of data is assigned. If for a piece of data the - key is not present or has an invalid value, that piece is - ignored by the pipeline. - - Supported only for tabular and time series Datasets. - timestamp_split_column_name (str): - Optional. The key is a name of one of the Dataset's data - columns. The value of the key values of the key (the values in - the column) must be in RFC 3339 `date-time` format, where - `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a - piece of data the key is not present or has an invalid value, - that piece is ignored by the pipeline. - - Supported only for tabular and time series Datasets. - timeout (int): - The maximum job running time in seconds. The default is 7 days. - restart_job_on_worker_restart (bool): - Restarts the entire CustomJob if a worker - gets restarted. This feature can be used by - distributed training jobs that are not resilient - to workers leaving and joining a job. - enable_web_access (bool): - Whether you want Vertex AI to enable interactive shell access - to training containers. - https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell - tensorboard (str): - Optional. The name of a Vertex AI - [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] - resource to which this CustomJob will upload Tensorboard - logs. Format: - ``projects/{project}/locations/{location}/tensorboards/{tensorboard}`` - - The training script should write Tensorboard to following Vertex AI environment - variable: - - AIP_TENSORBOARD_LOG_DIR - - `service_account` is required with provided `tensorboard`. - For more information on configuring your service account please visit: - https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training + additional_experiments (List[str]): + Optional. Additional experiment flags for the time series forcasting training. create_request_timeout (float): Optional. The timeout for the create request in seconds. + hierarchy_group_columns (List[str]): + Optional. A list of time series attribute column names that + define the time series hierarchy. Only one level of hierarchy is + supported, ex. ``region`` for a hierarchy of stores or + ``department`` for a hierarchy of products. If multiple columns + are specified, time series will be grouped by their combined + values, ex. (``blue``, ``large``) for ``color`` and ``size``, up + to 5 columns are accepted. If no group columns are specified, + all time series are considered to be part of the same group. + hierarchy_group_total_weight (float): + Optional. The weight of the loss for predictions aggregated over + time series in the same hierarchy group. + hierarchy_temporal_total_weight (float): + Optional. 
The weight of the loss for predictions aggregated over + the horizon for a single time series. + hierarchy_group_temporal_total_weight (float): + Optional. The weight of the loss for predictions aggregated over + both the horizon and time series in the same hierarchy group. + window_column (str): + Optional. Name of the column that should be used to filter input + rows. The column should contain either booleans or string + booleans; if the value of the row is True, generate a sliding + window from that row. + window_stride_length (int): + Optional. Step length used to generate input examples. Every + ``window_stride_length`` rows will be used to generate a sliding + window. + window_max_count (int): + Optional. Number of rows that should be used to generate input + examples. If the total row count is larger than this number, the + input data will be randomly sampled to hit the count. + holiday_regions (List[str]): + Optional. The geographical regions to use when creating holiday + features. This option is only allowed when data_granularity_unit + is ``day``. Acceptable values can come from any of the following + levels: + Top level: GLOBAL + Second level: continental regions + NA: North America + JAPAC: Japan and Asia Pacific + EMEA: Europe, the Middle East and Africa + LAC: Latin America and the Caribbean + Third level: countries from ISO 3166-1 Country codes. sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. - Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. + + Raises: + RuntimeError: If Training job has already been run or is waiting to run. """ - worker_pool_specs, managed_model = self._prepare_and_validate_run( - model_display_name=model_display_name, - model_labels=model_labels, - replica_count=replica_count, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - boot_disk_type=boot_disk_type, - boot_disk_size_gb=boot_disk_size_gb, - reduction_server_replica_count=reduction_server_replica_count, - reduction_server_machine_type=reduction_server_machine_type, - ) - # make and copy package - python_packager = source_utils._TrainingScriptPythonPackager( - script_path=self._script_path, requirements=self._requirements - ) + if model_display_name: + utils.validate_display_name(model_display_name) + if model_labels: + utils.validate_labels(model_labels) + + if self._is_waiting_to_run(): + raise RuntimeError( + f"{self._model_type} Forecasting Training is already scheduled " + "to run." + ) + + if self._has_run: + raise RuntimeError( + f"{self._model_type} Forecasting Training has already run." 
+ ) + + if additional_experiments: + self._add_additional_experiments(additional_experiments) return self._run( - python_packager=python_packager, dataset=dataset, - annotation_schema_uri=annotation_schema_uri, - worker_pool_specs=worker_pool_specs, - managed_model=managed_model, - args=args, - environment_variables=environment_variables, - base_output_dir=base_output_dir, - service_account=service_account, - network=network, - bigquery_destination=bigquery_destination, + target_column=target_column, + time_column=time_column, + time_series_identifier_column=time_series_identifier_column, + unavailable_at_forecast_columns=unavailable_at_forecast_columns, + available_at_forecast_columns=available_at_forecast_columns, + forecast_horizon=forecast_horizon, + data_granularity_unit=data_granularity_unit, + data_granularity_count=data_granularity_count, training_fraction_split=training_fraction_split, validation_fraction_split=validation_fraction_split, test_fraction_split=test_fraction_split, - training_filter_split=training_filter_split, - validation_filter_split=validation_filter_split, - test_filter_split=test_filter_split, predefined_split_column_name=predefined_split_column_name, timestamp_split_column_name=timestamp_split_column_name, - timeout=timeout, - restart_job_on_worker_restart=restart_job_on_worker_restart, - enable_web_access=enable_web_access, - tensorboard=tensorboard, - reduction_server_container_uri=reduction_server_container_uri - if reduction_server_replica_count > 0 - else None, + weight_column=weight_column, + time_series_attribute_columns=time_series_attribute_columns, + context_window=context_window, + budget_milli_node_hours=budget_milli_node_hours, + export_evaluated_data_items=export_evaluated_data_items, + export_evaluated_data_items_bigquery_destination_uri=export_evaluated_data_items_bigquery_destination_uri, + export_evaluated_data_items_override_destination=export_evaluated_data_items_override_destination, + quantiles=quantiles, + validation_options=validation_options, + model_display_name=model_display_name, + model_labels=model_labels, + hierarchy_group_columns=hierarchy_group_columns, + hierarchy_group_total_weight=hierarchy_group_total_weight, + hierarchy_temporal_total_weight=hierarchy_temporal_total_weight, + hierarchy_group_temporal_total_weight=hierarchy_group_temporal_total_weight, + window_column=window_column, + window_stride_length=window_stride_length, + window_max_count=window_max_count, + holiday_regions=holiday_regions, sync=sync, create_request_timeout=create_request_timeout, ) - @base.optional_sync(construct_object_on_arg="managed_model") + @base.optional_sync() def _run( self, - python_packager: source_utils._TrainingScriptPythonPackager, - dataset: Optional[ - Union[ - datasets.ImageDataset, - datasets.TabularDataset, - datasets.TextDataset, - datasets.VideoDataset, - ] - ], - annotation_schema_uri: Optional[str], - worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, - managed_model: Optional[gca_model.Model] = None, - args: Optional[List[Union[str, float, int]]] = None, - environment_variables: Optional[Dict[str, str]] = None, - base_output_dir: Optional[str] = None, - service_account: Optional[str] = None, - network: Optional[str] = None, - bigquery_destination: Optional[str] = None, + dataset: datasets.TimeSeriesDataset, + target_column: str, + time_column: str, + time_series_identifier_column: str, + unavailable_at_forecast_columns: List[str], + available_at_forecast_columns: List[str], + forecast_horizon: int, + 
data_granularity_unit: str, + data_granularity_count: int, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, - training_filter_split: Optional[str] = None, - validation_filter_split: Optional[str] = None, - test_filter_split: Optional[str] = None, predefined_split_column_name: Optional[str] = None, timestamp_split_column_name: Optional[str] = None, - timeout: Optional[int] = None, - restart_job_on_worker_restart: bool = False, - enable_web_access: bool = False, - tensorboard: Optional[str] = None, - reduction_server_container_uri: Optional[str] = None, - sync=True, + weight_column: Optional[str] = None, + time_series_attribute_columns: Optional[List[str]] = None, + context_window: Optional[int] = None, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, + quantiles: Optional[List[float]] = None, + validation_options: Optional[str] = None, + budget_milli_node_hours: int = 1000, + model_display_name: Optional[str] = None, + model_labels: Optional[Dict[str, str]] = None, + hierarchy_group_columns: Optional[List[str]] = None, + hierarchy_group_total_weight: Optional[float] = None, + hierarchy_temporal_total_weight: Optional[float] = None, + hierarchy_group_temporal_total_weight: Optional[float] = None, + window_column: Optional[str] = None, + window_stride_length: Optional[int] = None, + window_max_count: Optional[int] = None, + holiday_regions: Optional[List[str]] = None, + sync: bool = True, create_request_timeout: Optional[float] = None, - ) -> Optional[models.Model]: - """Packages local script and launches training_job. - - Args: - python_packager (source_utils._TrainingScriptPythonPackager): - Required. Python Packager pointing to training script locally. - dataset ( - Union[ - datasets.ImageDataset, - datasets.TabularDataset, - datasets.TextDataset, - datasets.VideoDataset, - ] - ): - Vertex AI to fit this training against. - annotation_schema_uri (str): - Google Cloud Storage URI points to a YAML file describing - annotation schema. - worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): - Worker pools pecs required to run job. - managed_model (gca_model.Model): - Model proto if this script produces a Managed Model. - args (List[Unions[str, int, float]]): - Command line arguments to be passed to the Python script. - environment_variables (Dict[str, str]): - Environment variables to be passed to the container. - Should be a dictionary where keys are environment variable names - and values are environment variable values for those names. - At most 10 environment variables can be specified. - The Name of the environment variable must be unique. - - environment_variables = { - 'MY_KEY': 'MY_VALUE' - } - base_output_dir (str): - GCS output directory of job. If not provided a - timestamped directory in the staging directory will be used. + ) -> models.Model: + """Runs the training job and returns a model. - Vertex AI sets the following environment variables when it runs your training code: + If training on a Vertex AI dataset, you can use one of the following split configurations: + Data fraction splits: + Any of ``training_fraction_split``, ``validation_fraction_split`` and + ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If + the provided ones sum to less than 1, the remainder is assigned to sets as + decided by Vertex AI. 
If none of the fractions are set, by default roughly 80% + of data will be used for training, 10% for validation, and 10% for test. - - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ - - AIP_TENSORBOARD_LOG_DIR: a Cloud Storage URI of a directory intended for saving TensorBoard logs, i.e. /logs/ + Predefined splits: + Assigns input data to training, validation, and test sets based on the value of a provided key. + If using predefined splits, ``predefined_split_column_name`` must be provided. + Supported only for tabular Datasets. - service_account (str): - Specifies the service account for workload run-as account. - Users submitting jobs must have act-as permission on this run-as account. - network (str): - The full name of the Compute Engine network to which the job - should be peered. For example, projects/12345/global/networks/myVPC. - Private services access must already be configured for the network. - If left unspecified, the job is not peered with any network. - bigquery_destination (str): - Provide this field if `dataset` is a BiqQuery dataset. - The BigQuery project location where the training data is to - be written to. In the given project a new dataset is created - with name - ``dataset___`` - where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All - training input data will be written into that dataset. In - the dataset three tables will be created, ``training``, - ``validation`` and ``test``. + Timestamp splits: + Assigns input data to training, validation, and test sets + based on a provided timestamps. The youngest data pieces are + assigned to training set, next to validation set, and the oldest + to the test set. + Supported only for tabular Datasets. - - AIP_DATA_FORMAT = "bigquery". - - AIP_TRAINING_DATA_URI ="bigquery_destination.dataset_*.training" - - AIP_VALIDATION_DATA_URI = "bigquery_destination.dataset_*.validation" - - AIP_TEST_DATA_URI = "bigquery_destination.dataset_*.test" - training_fraction_split (float): - Optional. The fraction of the input data that is to be used to train - the Model. This is ignored if Dataset is not provided. - validation_fraction_split (float): - Optional. The fraction of the input data that is to be used to validate - the Model. This is ignored if Dataset is not provided. - test_fraction_split (float): - Optional. The fraction of the input data that is to be used to evaluate - the Model. This is ignored if Dataset is not provided. - training_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to train the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. This is ignored if Dataset is not provided. - validation_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to validate the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. This is ignored if Dataset is not provided. 
- test_filter_split (str): - Optional. A filter on DataItems of the Dataset. DataItems that match - this filter are used to test the Model. A filter with same syntax - as the one used in DatasetService.ListDataItems may be used. If a - single DataItem is matched by more than one of the FilterSplit filters, - then it is assigned to the first set that applies to it in the training, - validation, test order. This is ignored if Dataset is not provided. + Args: + dataset (datasets.TimeSeriesDataset): + Required. The dataset within the same Project from which data will be used to train the Model. The + Dataset must use schema compatible with Model being trained, + and what is compatible should be described in the used + TrainingPipeline's [training_task_definition] + [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. + For time series Datasets, all their data is exported to + training, to pick and choose from. + target_column (str): + Required. Name of the column that the Model is to predict values for. This + column must be unavailable at forecast. + time_column (str): + Required. Name of the column that identifies time order in the time series. + This column must be available at forecast. + time_series_identifier_column (str): + Required. Name of the column that identifies the time series. + unavailable_at_forecast_columns (List[str]): + Required. Column names of columns that are unavailable at forecast. + Each column contains information for the given entity (identified by the + [time_series_identifier_column]) that is unknown before the forecast + (e.g. population of a city in a given year, or weather on a given day). + available_at_forecast_columns (List[str]): + Required. Column names of columns that are available at forecast. + Each column contains information for the given entity (identified by the + [time_series_identifier_column]) that is known at forecast. + forecast_horizon: (int): + Required. The amount of time into the future for which forecasted values for the target are + returned. Expressed in number of units defined by the [data_granularity_unit] and + [data_granularity_count] field. Inclusive. + data_granularity_unit (str): + Required. The data granularity unit. Accepted values are ``minute``, + ``hour``, ``day``, ``week``, ``month``, ``year``. + data_granularity_count (int): + Required. The number of data granularity units between data points in the training + data. If [data_granularity_unit] is `minute`, can be 1, 5, 10, 15, or 30. For all other + values of [data_granularity_unit], must be 1. + training_fraction_split (float): + Optional. The fraction of the input data that is to be used to train + the Model. This is ignored if Dataset is not provided. + validation_fraction_split (float): + Optional. The fraction of the input data that is to be used to validate + the Model. This is ignored if Dataset is not provided. + test_fraction_split (float): + Optional. The fraction of the input data that is to be used to evaluate + the Model. This is ignored if Dataset is not provided. predefined_split_column_name (str): Optional. The key is a name of one of the Dataset's data columns. The value of the key (either the label's value or @@ -2294,127 +2132,319 @@ def _run( `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not present or has an invalid value, that piece is ignored by the pipeline. - Supported only for tabular and time series Datasets. - timeout (int): - The maximum job running time in seconds. 
The default is 7 days. - restart_job_on_worker_restart (bool): - Restarts the entire CustomJob if a worker - gets restarted. This feature can be used by - distributed training jobs that are not resilient - to workers leaving and joining a job. - enable_web_access (bool): - Whether you want Vertex AI to enable interactive shell access - to training containers. - https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell - tensorboard (str): - Optional. The name of a Vertex AI - [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] - resource to which this CustomJob will upload Tensorboard - logs. Format: - ``projects/{project}/locations/{location}/tensorboards/{tensorboard}`` + This parameter must be used with training_fraction_split, + validation_fraction_split, and test_fraction_split. + weight_column (str): + Optional. Name of the column that should be used as the weight column. + Higher values in this column give more importance to the row + during Model training. The column must have numeric values between 0 and + 10000 inclusively, and 0 value means that the row is ignored. + If the weight column field is not set, then all rows are assumed to have + equal weight of 1. This column must be available at forecast. + time_series_attribute_columns (List[str]): + Optional. Column names that should be used as attribute columns. + Each column is constant within a time series. + context_window (int): + Optional. The amount of time into the past training and prediction data is used for + model training and prediction respectively. Expressed in number of units defined by the + [data_granularity_unit] and [data_granularity_count] fields. When not provided uses the + default value of 0 which means the model sets each series context window to be 0 (also + known as "cold start"). Inclusive. + export_evaluated_data_items (bool): + Whether to export the test set predictions to a BigQuery table. + If False, then the export is not performed. + export_evaluated_data_items_bigquery_destination_uri (string): + Optional. URI of desired destination BigQuery table for exported test set predictions. - The training script should write Tensorboard to following Vertex AI environment - variable: + Expected format: + ``bq://::
`` - AIP_TENSORBOARD_LOG_DIR + If not specified, then results are exported to the following auto-created BigQuery + table: + ``:export_evaluated_examples__.evaluated_examples`` - `service_account` is required with provided `tensorboard`. - For more information on configuring your service account please visit: - https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training - reduction_server_container_uri (str): - Optional. The Uri of the reduction server container image. + Applies only if [export_evaluated_data_items] is True. + export_evaluated_data_items_override_destination (bool): + Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], + if the table exists, for exported test set predictions. If False, and the + table exists, then the training job will fail. + + Applies only if [export_evaluated_data_items] is True and + [export_evaluated_data_items_bigquery_destination_uri] is specified. + quantiles (List[float]): + Quantiles to use for the `minimize-quantile-loss` + [AutoMLForecastingTrainingJob.optimization_objective]. This argument is required in + this case. + + Accepts up to 5 quantiles in the form of a double from 0 to 1, exclusive. + Each quantile must be unique. + validation_options (str): + Validation options for the data validation component. The available options are: + "fail-pipeline" - (default), will validate against the validation and fail the pipeline + if it fails. + "ignore-validation" - ignore the results of the validation and continue the pipeline + budget_milli_node_hours (int): + Optional. The train budget of creating this Model, expressed in milli node + hours i.e. 1,000 value in this field means 1 node hour. + The training cost of the model will not exceed this budget. The final + cost will be attempted to be close to the budget, though may end up + being (even) noticeably smaller - at the backend's discretion. This + especially may happen when further model training ceases to provide + any improvements. + If the budget is set to a value known to be insufficient to train a + Model for the given training set, the training won't be attempted and + will error. + The minimum value is 1000 and the maximum is 72000. + model_display_name (str): + Optional. If the script produces a managed Vertex AI Model. The display name of + the Model. The name can be up to 128 characters long and can be consist + of any UTF-8 characters. + + If not provided upon creation, the job's display_name is used. + model_labels (Dict[str, str]): + Optional. The labels with user-defined metadata to + organize your Models. + Label keys and values can be no longer than 64 + characters (Unicode codepoints), can only + contain lowercase letters, numeric characters, + underscores and dashes. International characters + are allowed. + See https://goo.gl/xmQnxf for more information + and examples of labels. + hierarchy_group_columns (List[str]): + Optional. A list of time series attribute column names that + define the time series hierarchy. Only one level of hierarchy is + supported, ex. ``region`` for a hierarchy of stores or + ``department`` for a hierarchy of products. If multiple columns + are specified, time series will be grouped by their combined + values, ex. (``blue``, ``large``) for ``color`` and ``size``, up + to 5 columns are accepted. If no group columns are specified, + all time series are considered to be part of the same group. + hierarchy_group_total_weight (float): + Optional. 
The weight of the loss for predictions aggregated over + time series in the same hierarchy group. + hierarchy_temporal_total_weight (float): + Optional. The weight of the loss for predictions aggregated over + the horizon for a single time series. + hierarchy_group_temporal_total_weight (float): + Optional. The weight of the loss for predictions aggregated over + both the horizon and time series in the same hierarchy group. + window_column (str): + Optional. Name of the column that should be used to filter input + rows. The column should contain either booleans or string + booleans; if the value of the row is True, generate a sliding + window from that row. + window_stride_length (int): + Optional. Step length used to generate input examples. Every + ``window_stride_length`` rows will be used to generate a sliding + window. + window_max_count (int): + Optional. Number of rows that should be used to generate input + examples. If the total row count is larger than this number, the + input data will be randomly sampled to hit the count. + holiday_regions (List[str]): + Optional. The geographical regions to use when creating holiday + features. This option is only allowed when data_granularity_unit + is ``day``. Acceptable values can come from any of the following + levels: + Top level: GLOBAL + Second level: continental regions + NA: North America + JAPAC: Japan and Asia Pacific + EMEA: Europe, the Middle East and Africa + LAC: Latin America and the Caribbean + Third level: countries from ISO 3166-1 Country codes. sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. - create_request_timeout (float) - Optional. The timeout for the create request in seconds - + create_request_timeout (float): + Optional. The timeout for the create request in seconds. Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. """ - package_gcs_uri = python_packager.package_and_copy_to_gcs( - gcs_staging_dir=self._staging_bucket, - project=self.project, - credentials=self.credentials, - ) + # auto-populate transformations + if self._column_transformations is None: + _LOGGER.info( + "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations." + ) - for spec_order, spec in enumerate(worker_pool_specs): + ( + self._column_transformations, + column_names, + ) = dataset._get_default_column_transformations(target_column) - if not spec: - continue + _LOGGER.info( + "The column transformation of type 'auto' was set for the following columns: %s." 
+ % column_names + ) - if ( - spec_order == worker_spec_utils._SPEC_ORDERS["server_spec"] - and reduction_server_container_uri - ): - spec["container_spec"] = { - "image_uri": reduction_server_container_uri, - } - else: - spec["python_package_spec"] = { - "executor_image_uri": self._container_uri, - "python_module": python_packager.module_name, - "package_uris": [package_gcs_uri], - } + window_config = self._create_window_config( + column=window_column, + stride_length=window_stride_length, + max_count=window_max_count, + ) - if args: - spec["python_package_spec"]["args"] = args + training_task_inputs_dict = { + # required inputs + "targetColumn": target_column, + "timeColumn": time_column, + "timeSeriesIdentifierColumn": time_series_identifier_column, + "timeSeriesAttributeColumns": time_series_attribute_columns, + "unavailableAtForecastColumns": unavailable_at_forecast_columns, + "availableAtForecastColumns": available_at_forecast_columns, + "forecastHorizon": forecast_horizon, + "dataGranularity": { + "unit": data_granularity_unit, + "quantity": data_granularity_count, + }, + "transformations": self._column_transformations, + "trainBudgetMilliNodeHours": budget_milli_node_hours, + # optional inputs + "weightColumn": weight_column, + "contextWindow": context_window, + "quantiles": quantiles, + "validationOptions": validation_options, + "optimizationObjective": self._optimization_objective, + "holidayRegions": holiday_regions, + } - if environment_variables: - spec["python_package_spec"]["env"] = [ - {"name": key, "value": value} - for key, value in environment_variables.items() - ] + # TODO(TheMichaelHu): Remove the ifs once the API supports these inputs. + if any( + [ + hierarchy_group_columns, + hierarchy_group_total_weight, + hierarchy_temporal_total_weight, + hierarchy_group_temporal_total_weight, + ] + ): + training_task_inputs_dict["hierarchyConfig"] = { + "groupColumns": hierarchy_group_columns, + "groupTotalWeight": hierarchy_group_total_weight, + "temporalTotalWeight": hierarchy_temporal_total_weight, + "groupTemporalTotalWeight": hierarchy_group_temporal_total_weight, + } + if window_config: + training_task_inputs_dict["windowConfig"] = window_config + + final_export_eval_bq_uri = export_evaluated_data_items_bigquery_destination_uri + if final_export_eval_bq_uri and not final_export_eval_bq_uri.startswith( + "bq://" + ): + final_export_eval_bq_uri = f"bq://{final_export_eval_bq_uri}" + + if export_evaluated_data_items: + training_task_inputs_dict["exportEvaluatedDataItemsConfig"] = { + "destinationBigqueryUri": final_export_eval_bq_uri, + "overrideExistingTable": export_evaluated_data_items_override_destination, + } - ( - training_task_inputs, - base_output_dir, - ) = self._prepare_training_task_inputs_and_output_dir( - worker_pool_specs=worker_pool_specs, - base_output_dir=base_output_dir, - service_account=service_account, - network=network, - timeout=timeout, - restart_job_on_worker_restart=restart_job_on_worker_restart, - enable_web_access=enable_web_access, - tensorboard=tensorboard, + if self._additional_experiments: + training_task_inputs_dict[ + "additionalExperiments" + ] = self._additional_experiments + + model = gca_model.Model( + display_name=model_display_name or self._display_name, + labels=model_labels or self._labels, + encryption_spec=self._model_encryption_spec, ) - model = self._run_job( - training_task_definition=schema.training_job.definition.custom_task, - training_task_inputs=training_task_inputs, + new_model = self._run_job( + 
training_task_definition=self._training_task_definition, + training_task_inputs=training_task_inputs_dict, dataset=dataset, - annotation_schema_uri=annotation_schema_uri, training_fraction_split=training_fraction_split, validation_fraction_split=validation_fraction_split, test_fraction_split=test_fraction_split, - training_filter_split=training_filter_split, - validation_filter_split=validation_filter_split, - test_filter_split=test_filter_split, predefined_split_column_name=predefined_split_column_name, timestamp_split_column_name=timestamp_split_column_name, - model=managed_model, - gcs_destination_uri_prefix=base_output_dir, - bigquery_destination=bigquery_destination, + model=model, create_request_timeout=create_request_timeout, ) - return model + if export_evaluated_data_items: + _LOGGER.info( + "Exported examples available at:\n%s" + % self.evaluated_data_items_bigquery_uri + ) + return new_model -class CustomContainerTrainingJob(_CustomTrainingJob): - """Class to launch a Custom Training Job in Vertex AI using a - Container.""" + @property + def _model_upload_fail_string(self) -> str: + """Helper property for model upload failure.""" + return ( + f"Training Pipeline {self.resource_name} is not configured to upload a " + "Model." + ) + + @property + def evaluated_data_items_bigquery_uri(self) -> Optional[str]: + """BigQuery location of exported evaluated examples from the Training Job + Returns: + str: BigQuery uri for the exported evaluated examples if the export + feature is enabled for training. + None: If the export feature was not enabled for training. + """ + + self._assert_gca_resource_is_available() + + metadata = self._gca_resource.training_task_metadata + if metadata and "evaluatedDataItemsBigqueryUri" in metadata: + return metadata["evaluatedDataItemsBigqueryUri"] + + return None + + def _add_additional_experiments(self, additional_experiments: List[str]): + """Add experiment flags to the training job. + Args: + additional_experiments (List[str]): + Experiment flags that can enable some experimental training features. + """ + self._additional_experiments.extend(additional_experiments) + + @staticmethod + def _create_window_config( + column: Optional[str] = None, + stride_length: Optional[int] = None, + max_count: Optional[int] = None, + ) -> Optional[Dict[str, Union[int, str]]]: + """Creates a window config from training job arguments.""" + configs = { + "column": column, + "strideLength": stride_length, + "maxCount": max_count, + } + present_configs = {k: v for k, v in configs.items() if v is not None} + if not present_configs: + return None + if len(present_configs) > 1: + raise ValueError( + "More than one windowing strategy provided. Make sure only one " + "of window_column, window_stride_length, or window_max_count " + "is specified." + ) + return present_configs + + +# TODO(b/172368325) add scheduling, custom_job.Scheduling +class CustomTrainingJob(_CustomTrainingJob): + """Class to launch a Custom Training Job in Vertex AI using a script. + + Takes a training implementation as a python script and executes that + script in Cloud Vertex AI Training. 
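A hedged sketch of the kind of script such a job executes. The file name and the training logic are placeholders; the environment variable names are the ones documented in this module (AIP_TRAINING_DATA_URI, AIP_VALIDATION_DATA_URI, AIP_TEST_DATA_URI, AIP_DATA_FORMAT, AIP_MODEL_DIR):

    # task.py -- illustrative training script; the model code itself is a placeholder.
    import os

    # Vertex AI injects these environment variables when a managed dataset is attached.
    training_uri = os.environ.get("AIP_TRAINING_DATA_URI")
    validation_uri = os.environ.get("AIP_VALIDATION_DATA_URI")
    test_uri = os.environ.get("AIP_TEST_DATA_URI")
    data_format = os.environ.get("AIP_DATA_FORMAT")

    # Artifacts written under this directory are picked up as the managed Model.
    model_dir = os.environ["AIP_MODEL_DIR"]

    # ... load data from the URIs above, train, then save the model artifacts
    # under model_dir using the framework's own save routine ...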
+ """ def __init__( self, # TODO(b/223262536): Make display_name parameter fully optional in next major release display_name: str, + script_path: str, container_uri: str, - command: Sequence[str] = None, + requirements: Optional[Sequence[str]] = None, model_serving_container_image_uri: Optional[str] = None, model_serving_container_predict_route: Optional[str] = None, model_serving_container_health_route: Optional[str] = None, @@ -2434,12 +2464,13 @@ def __init__( model_encryption_spec_key_name: Optional[str] = None, staging_bucket: Optional[str] = None, ): - """Constructs a Custom Container Training Job. + """Constructs a Custom Training Job from a Python script. - job = aiplatform.CustomContainerTrainingJob( + job = aiplatform.CustomTrainingJob( display_name='test-train', - container_uri='gcr.io/my_project_id/my_image_name:tag', - command=['python3', 'run_script.py'] + script_path='test_script.py', + requirements=['pandas', 'numpy'], + container_uri='gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest', model_serving_container_image_uri='gcr.io/my-trainer/serving:1', model_serving_container_predict_route='predict', model_serving_container_health_route='metadata, @@ -2471,11 +2502,11 @@ def __init__( Args: display_name (str): Required. The user-defined name of this TrainingPipeline. + script_path (str): Required. Local path to training script. container_uri (str): Required: Uri of the training container image in the GCR. - command (Sequence[str]): - The command to be invoked when the container is started. - It overrides the entrypoint instruction in Dockerfile when provided + requirements (Sequence[str]): + List of python packages dependencies of script. model_serving_container_image_uri (str): If the training produces a managed Vertex AI Model, the URI of the Model serving container suitable for serving the model produced by the @@ -2634,7 +2665,8 @@ def __init__( staging_bucket=staging_bucket, ) - self._command = command + self._requirements = requirements + self._script_path = script_path def run( self, @@ -2719,7 +2751,14 @@ def run( Supported only for tabular Datasets. Args: - dataset (Union[datasets.ImageDataset,datasets.TabularDataset,datasets.TextDataset,datasets.VideoDataset]): + dataset ( + Union[ + datasets.ImageDataset, + datasets.TabularDataset, + datasets.TextDataset, + datasets.VideoDataset, + ] + ): Vertex AI to fit this training against. Custom training script should retrieve datasets through passed in environment variables uris: @@ -2914,21 +2953,16 @@ def run( `service_account` is required with provided `tensorboard`. For more information on configuring your service account please visit: https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training + create_request_timeout (float): + Optional. The timeout for the create request in seconds. sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. - create_request_timeout (float): - Optional. The timeout for the create request in seconds. Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. - - Raises: - RuntimeError: If Training job has already been run, staging_bucket has not - been set, or model_display_name was provided but required arguments - were not provided in constructor. 
""" worker_pool_specs, managed_model = self._prepare_and_validate_run( model_display_name=model_display_name, @@ -2943,7 +2977,13 @@ def run( reduction_server_machine_type=reduction_server_machine_type, ) + # make and copy package + python_packager = source_utils._TrainingScriptPythonPackager( + script_path=self._script_path, requirements=self._requirements + ) + return self._run( + python_packager=python_packager, dataset=dataset, annotation_schema_uri=annotation_schema_uri, worker_pool_specs=worker_pool_specs, @@ -2976,6 +3016,7 @@ def run( @base.optional_sync(construct_object_on_arg="managed_model") def _run( self, + python_packager: source_utils._TrainingScriptPythonPackager, dataset: Optional[ Union[ datasets.ImageDataset, @@ -3010,7 +3051,10 @@ def _run( create_request_timeout: Optional[float] = None, ) -> Optional[models.Model]: """Packages local script and launches training_job. + Args: + python_packager (source_utils._TrainingScriptPythonPackager): + Required. Python Packager pointing to training script locally. dataset ( Union[ datasets.ImageDataset, @@ -3057,14 +3101,8 @@ def _run( should be peered. For example, projects/12345/global/networks/myVPC. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network. - timeout (int): - The maximum job running time in seconds. The default is 7 days. - restart_job_on_worker_restart (bool): - Restarts the entire CustomJob if a worker - gets restarted. This feature can be used by - distributed training jobs that are not resilient - to workers leaving and joining a job. bigquery_destination (str): + Provide this field if `dataset` is a BiqQuery dataset. The BigQuery project location where the training data is to be written to. In the given project a new dataset is created with name @@ -3127,6 +3165,13 @@ def _run( that piece is ignored by the pipeline. Supported only for tabular and time series Datasets. + timeout (int): + The maximum job running time in seconds. The default is 7 days. + restart_job_on_worker_restart (bool): + Restarts the entire CustomJob if a worker + gets restarted. This feature can be used by + distributed training jobs that are not resilient + to workers leaving and joining a job. enable_web_access (bool): Whether you want Vertex AI to enable interactive shell access to training containers. @@ -3152,13 +3197,18 @@ def _run( Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. - create_request_timeout (float): - Optional. The timeout for the create request in seconds. + create_request_timeout (float) + Optional. The timeout for the create request in seconds Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. 
""" + package_gcs_uri = python_packager.package_and_copy_to_gcs( + gcs_staging_dir=self._staging_bucket, + project=self.project, + credentials=self.credentials, + ) for spec_order, spec in enumerate(worker_pool_specs): @@ -3173,16 +3223,17 @@ def _run( "image_uri": reduction_server_container_uri, } else: - spec["containerSpec"] = {"imageUri": self._container_uri} - - if self._command: - spec["containerSpec"]["command"] = self._command + spec["python_package_spec"] = { + "executor_image_uri": self._container_uri, + "python_module": python_packager.module_name, + "package_uris": [package_gcs_uri], + } if args: - spec["containerSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["containerSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] @@ -3223,125 +3274,170 @@ def _run( return model -class AutoMLTabularTrainingJob(_TrainingJob): - _supported_training_schemas = (schema.training_job.definition.automl_tabular,) +class CustomContainerTrainingJob(_CustomTrainingJob): + """Class to launch a Custom Training Job in Vertex AI using a + Container.""" def __init__( self, # TODO(b/223262536): Make display_name parameter fully optional in next major release display_name: str, - optimization_prediction_type: str, - optimization_objective: Optional[str] = None, - column_specs: Optional[Dict[str, str]] = None, - column_transformations: Optional[List[Dict[str, Dict[str, str]]]] = None, - optimization_objective_recall_value: Optional[float] = None, - optimization_objective_precision_value: Optional[float] = None, + container_uri: str, + command: Sequence[str] = None, + model_serving_container_image_uri: Optional[str] = None, + model_serving_container_predict_route: Optional[str] = None, + model_serving_container_health_route: Optional[str] = None, + model_serving_container_command: Optional[Sequence[str]] = None, + model_serving_container_args: Optional[Sequence[str]] = None, + model_serving_container_environment_variables: Optional[Dict[str, str]] = None, + model_serving_container_ports: Optional[Sequence[int]] = None, + model_description: Optional[str] = None, + model_instance_schema_uri: Optional[str] = None, + model_parameters_schema_uri: Optional[str] = None, + model_prediction_schema_uri: Optional[str] = None, project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, labels: Optional[Dict[str, str]] = None, training_encryption_spec_key_name: Optional[str] = None, model_encryption_spec_key_name: Optional[str] = None, + staging_bucket: Optional[str] = None, ): - """Constructs a AutoML Tabular Training Job. - - Example usage: + """Constructs a Custom Container Training Job. - job = training_jobs.AutoMLTabularTrainingJob( - display_name="my_display_name", - optimization_prediction_type="classification", - optimization_objective="minimize-log-loss", - column_specs={"column_1": "auto", "column_2": "numeric"}, + job = aiplatform.CustomContainerTrainingJob( + display_name='test-train', + container_uri='gcr.io/my_project_id/my_image_name:tag', + command=['python3', 'run_script.py'] + model_serving_container_image_uri='gcr.io/my-trainer/serving:1', + model_serving_container_predict_route='predict', + model_serving_container_health_route='metadata, labels={'key': 'value'}, ) - Args: - display_name (str): - Required. The user-defined name of this TrainingPipeline. 
- optimization_prediction_type (str): - The type of prediction the Model is to produce. - "classification" - Predict one out of multiple target values is - picked for each row. - "regression" - Predict a value based on its relation to other values. - This type is available only to columns that contain - semantically numeric values, i.e. integers or floating - point number, even if stored as e.g. strings. + Usage with Dataset: - optimization_objective (str): - Optional. Objective function the Model is to be optimized towards. The training - task creates a Model that maximizes/minimizes the value of the objective - function over the validation set. + ds = aiplatform.TabularDataset( + 'projects/my-project/locations/us-central1/datasets/12345') - The supported optimization objectives depend on the prediction type, and - in the case of classification also the number of distinct values in the - target column (two distint values -> binary, 3 or more distinct values - -> multi class). - If the field is not set, the default objective function is used. + job.run( + ds, + replica_count=1, + model_display_name='my-trained-model', + model_labels={'key': 'value'}, + ) - Classification (binary): - "maximize-au-roc" (default) - Maximize the area under the receiver - operating characteristic (ROC) curve. - "minimize-log-loss" - Minimize log loss. - "maximize-au-prc" - Maximize the area under the precision-recall curve. - "maximize-precision-at-recall" - Maximize precision for a specified - recall value. - "maximize-recall-at-precision" - Maximize recall for a specified - precision value. + Usage without Dataset: - Classification (multi class): - "minimize-log-loss" (default) - Minimize log loss. + job.run(replica_count=1, model_display_name='my-trained-model) - Regression: - "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE). - "minimize-mae" - Minimize mean-absolute error (MAE). - "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). - column_specs (Dict[str, str]): - Optional. Alternative to column_transformations where the keys of the dict - are column names and their respective values are one of - AutoMLTabularTrainingJob.column_data_types. - When creating transformation for BigQuery Struct column, the column - should be flattened using "." as the delimiter. Only columns with no child - should have a transformation. - If an input column has no transformations on it, such a column is - ignored by the training, except for the targetColumn, which should have - no transformations defined on. - Only one of column_transformations or column_specs should be passed. If none - of column_transformations or column_specs is passed, the local credentials - being used will try setting column_specs to "auto". To do this, the local - credentials require read access to the GCS or BigQuery training data source. - column_transformations (List[Dict[str, Dict[str, str]]]): - Optional. Transformations to apply to the input columns (i.e. columns other - than the targetColumn). Each transformation may produce multiple - result values from the column's value, and all are used for training. - When creating transformation for BigQuery Struct column, the column - should be flattened using "." as the delimiter. Only columns with no child - should have a transformation. - If an input column has no transformations on it, such a column is - ignored by the training, except for the targetColumn, which should have - no transformations defined on. 
- Only one of column_transformations or column_specs should be passed. - Consider using column_specs as column_transformations will be deprecated - eventually. If none of column_transformations or column_specs is passed, - the local credentials being used will try setting column_transformations to - "auto". To do this, the local credentials require read access to the GCS or - BigQuery training data source. - optimization_objective_recall_value (float): - Optional. Required when maximize-precision-at-recall optimizationObjective was - picked, represents the recall value at which the optimization is done. - The minimum value is 0 and the maximum is 1.0. - optimization_objective_precision_value (float): - Optional. Required when maximize-recall-at-precision optimizationObjective was - picked, represents the precision value at which the optimization is - done. + TODO(b/169782082) add documentation about traning utilities + To ensure your model gets saved in Vertex AI, write your saved model to + os.environ["AIP_MODEL_DIR"] in your provided training script. - The minimum value is 0 and the maximum is 1.0. + + Args: + display_name (str): + Required. The user-defined name of this TrainingPipeline. + container_uri (str): + Required: Uri of the training container image in the GCR. + command (Sequence[str]): + The command to be invoked when the container is started. + It overrides the entrypoint instruction in Dockerfile when provided + model_serving_container_image_uri (str): + If the training produces a managed Vertex AI Model, the URI of the + Model serving container suitable for serving the model produced by the + training script. + model_serving_container_predict_route (str): + If the training produces a managed Vertex AI Model, An HTTP path to + send prediction requests to the container, and which must be supported + by it. If not specified a default HTTP path will be used by Vertex AI. + model_serving_container_health_route (str): + If the training produces a managed Vertex AI Model, an HTTP path to + send health check requests to the container, and which must be supported + by it. If not specified a standard HTTP path will be used by AI + Platform. + model_serving_container_command (Sequence[str]): + The command with which the container is run. Not executed within a + shell. The Docker image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's + environment. If a variable cannot be resolved, the reference in the + input string will be unchanged. The $(VAR_NAME) syntax can be escaped + with a double $$, ie: $$(VAR_NAME). Escaped references will never be + expanded, regardless of whether the variable exists or not. + model_serving_container_args (Sequence[str]): + The arguments to the command. The Docker image's CMD is used if this is + not provided. Variable references $(VAR_NAME) are expanded using the + container's environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. The $(VAR_NAME) syntax can be + escaped with a double $$, ie: $$(VAR_NAME). Escaped references will + never be expanded, regardless of whether the variable exists or not. + model_serving_container_environment_variables (Dict[str, str]): + The environment variables that are to be present in the container. + Should be a dictionary where keys are environment variable names + and values are environment variable values for those names. 
+ model_serving_container_ports (Sequence[int]): + Declaration of ports that are exposed by the container. This field is + primarily informational, it gives Vertex AI information about the + network connections the container uses. Listing or not a port here has + no impact on whether the port is actually exposed, any port listening on + the default "0.0.0.0" address inside a container will be accessible from + the network. + model_description (str): + The description of the Model. + model_instance_schema_uri (str): + Optional. Points to a YAML file stored on Google Cloud + Storage describing the format of a single instance, which + are used in + ``PredictRequest.instances``, + ``ExplainRequest.instances`` + and + ``BatchPredictionJob.input_config``. + The schema is defined as an OpenAPI 3.0.2 `Schema + Object `__. + AutoML Models always have this field populated by AI + Platform. Note: The URI given on output will be immutable + and probably different, including the URI scheme, than the + one given on input. The output URI will point to a location + where the user only has a read access. + model_parameters_schema_uri (str): + Optional. Points to a YAML file stored on Google Cloud + Storage describing the parameters of prediction and + explanation via + ``PredictRequest.parameters``, + ``ExplainRequest.parameters`` + and + ``BatchPredictionJob.model_parameters``. + The schema is defined as an OpenAPI 3.0.2 `Schema + Object `__. + AutoML Models always have this field populated by AI + Platform, if no parameters are supported it is set to an + empty string. Note: The URI given on output will be + immutable and probably different, including the URI scheme, + than the one given on input. The output URI will point to a + location where the user only has a read access. + model_prediction_schema_uri (str): + Optional. Points to a YAML file stored on Google Cloud + Storage describing the format of a single prediction + produced by this Model, which are returned via + ``PredictResponse.predictions``, + ``ExplainResponse.explanations``, + and + ``BatchPredictionJob.output_config``. + The schema is defined as an OpenAPI 3.0.2 `Schema + Object `__. + AutoML Models always have this field populated by AI + Platform. Note: The URI given on output will be immutable + and probably different, including the URI scheme, than the + one given on input. The output URI will point to a location + where the user only has a read access. project (str): - Optional. Project to run training in. Overrides project set in aiplatform.init. + Project to run training in. Overrides project set in aiplatform.init. location (str): - Optional. Location to run training in. Overrides location set in aiplatform.init. + Location to run training in. Overrides location set in aiplatform.init. credentials (auth_credentials.Credentials): - Optional. Custom credentials to use to run call training service. Overrides + Custom credentials to use to run call training service. Overrides credentials set in aiplatform.init. labels (Dict[str, str]): Optional. The labels with user-defined metadata to @@ -3378,9 +3474,9 @@ def __init__( If set, the trained Model will be secured by this key. Overrides encryption_spec_key_name set in aiplatform.init. - - Raises: - ValueError: If both column_transformations and column_specs were provided. + staging_bucket (str): + Bucket used to stage source and training artifacts. Overrides + staging_bucket set in aiplatform.init. 
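To complement the constructor example above, a hedged sketch showing how the serving-container arguments documented here might be wired together. The image URIs, routes, ports, and environment values are illustrative placeholders:

    # Illustrative only: image URIs, routes, ports, and env values are placeholders.
    job = aiplatform.CustomContainerTrainingJob(
        display_name="train-and-serve",
        container_uri="gcr.io/my-project/trainer:latest",
        command=["python3", "trainer/task.py"],            # overrides the image ENTRYPOINT
        model_serving_container_image_uri="gcr.io/my-project/server:latest",
        model_serving_container_predict_route="/predict",
        model_serving_container_health_route="/health",
        model_serving_container_ports=[8080],
        model_serving_container_environment_variables={"MODEL_NAME": "demo"},
        model_description="Model trained by a custom container job.",
        staging_bucket="gs://my-staging-bucket",
    )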
""" if not display_name: display_name = self.__class__._generate_display_name() @@ -3392,45 +3488,73 @@ def __init__( labels=labels, training_encryption_spec_key_name=training_encryption_spec_key_name, model_encryption_spec_key_name=model_encryption_spec_key_name, + container_uri=container_uri, + model_instance_schema_uri=model_instance_schema_uri, + model_parameters_schema_uri=model_parameters_schema_uri, + model_prediction_schema_uri=model_prediction_schema_uri, + model_serving_container_environment_variables=model_serving_container_environment_variables, + model_serving_container_ports=model_serving_container_ports, + model_serving_container_image_uri=model_serving_container_image_uri, + model_serving_container_command=model_serving_container_command, + model_serving_container_args=model_serving_container_args, + model_serving_container_predict_route=model_serving_container_predict_route, + model_serving_container_health_route=model_serving_container_health_route, + model_description=model_description, + staging_bucket=staging_bucket, ) - self._column_transformations = ( - column_transformations_utils.validate_and_get_column_transformations( - column_specs, column_transformations - ) - ) - - self._optimization_objective = optimization_objective - self._optimization_prediction_type = optimization_prediction_type - self._optimization_objective_recall_value = optimization_objective_recall_value - self._optimization_objective_precision_value = ( - optimization_objective_precision_value - ) - - self._additional_experiments = [] + self._command = command def run( self, - dataset: datasets.TabularDataset, - target_column: str, + dataset: Optional[ + Union[ + datasets.ImageDataset, + datasets.TabularDataset, + datasets.TextDataset, + datasets.VideoDataset, + ] + ] = None, + annotation_schema_uri: Optional[str] = None, + model_display_name: Optional[str] = None, + model_labels: Optional[Dict[str, str]] = None, + base_output_dir: Optional[str] = None, + service_account: Optional[str] = None, + network: Optional[str] = None, + bigquery_destination: Optional[str] = None, + args: Optional[List[Union[str, float, int]]] = None, + environment_variables: Optional[Dict[str, str]] = None, + replica_count: int = 1, + machine_type: str = "n1-standard-4", + accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + accelerator_count: int = 0, + boot_disk_type: str = "pd-ssd", + boot_disk_size_gb: int = 100, + reduction_server_replica_count: int = 0, + reduction_server_machine_type: Optional[str] = None, + reduction_server_container_uri: Optional[str] = None, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, + training_filter_split: Optional[str] = None, + validation_filter_split: Optional[str] = None, + test_filter_split: Optional[str] = None, predefined_split_column_name: Optional[str] = None, timestamp_split_column_name: Optional[str] = None, - weight_column: Optional[str] = None, - budget_milli_node_hours: int = 1000, - model_display_name: Optional[str] = None, - model_labels: Optional[Dict[str, str]] = None, - disable_early_stopping: bool = False, - export_evaluated_data_items: bool = False, - export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, - export_evaluated_data_items_override_destination: bool = False, - additional_experiments: Optional[List[str]] = None, - sync: bool = True, + timeout: Optional[int] = None, + restart_job_on_worker_restart: bool = False, + enable_web_access: bool = 
False, + tensorboard: Optional[str] = None, + sync=True, create_request_timeout: Optional[float] = None, - ) -> models.Model: - """Runs the training job and returns a model. + ) -> Optional[models.Model]: + """Runs the custom training job. + + Distributed Training Support: + If replica count = 1 then one chief replica will be provisioned. If + replica_count > 1 the remainder will be provisioned as a worker replica pool. + ie: replica_count = 10 will result in 1 chief and 9 workers + All replicas have same machine_type, accelerator_type, and accelerator_count If training on a Vertex AI dataset, you can use one of the following split configurations: Data fraction splits: @@ -3440,6 +3564,17 @@ def run( decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. + Data filter splits: + Assigns input data to training, validation, and test sets + based on the given filters, data pieces not matched by any + filter are ignored. Currently only supported for Datasets + containing DataItems. + If any of the filters in this message are to match nothing, then + they can be set as '-' (the minus sign). + If using filter splits, all of ``training_filter_split``, ``validation_filter_split`` and + ``test_filter_split`` must be provided. + Supported only for unstructured Datasets. + Predefined splits: Assigns input data to training, validation, and test sets based on the value of a provided key. If using predefined splits, ``predefined_split_column_name`` must be provided. @@ -3453,16 +3588,126 @@ def run( Supported only for tabular Datasets. Args: - dataset (datasets.TabularDataset): - Required. The dataset within the same Project from which data will be used to train the Model. The - Dataset must use schema compatible with Model being trained, - and what is compatible should be described in the used - TrainingPipeline's [training_task_definition] - [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. - For tabular Datasets, all their data is exported to - training, to pick and choose from. - target_column (str): - Required. The name of the column values of which the Model is to predict. + dataset (Union[datasets.ImageDataset,datasets.TabularDataset,datasets.TextDataset,datasets.VideoDataset]): + Vertex AI to fit this training against. Custom training script should + retrieve datasets through passed in environment variables uris: + + os.environ["AIP_TRAINING_DATA_URI"] + os.environ["AIP_VALIDATION_DATA_URI"] + os.environ["AIP_TEST_DATA_URI"] + + Additionally the dataset format is passed in as: + + os.environ["AIP_DATA_FORMAT"] + annotation_schema_uri (str): + Google Cloud Storage URI points to a YAML file describing + annotation schema. The schema is defined as an OpenAPI 3.0.2 + [Schema Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schema-object) The schema files + that can be used here are found in + gs://google-cloud-aiplatform/schema/dataset/annotation/, + note that the chosen schema must be consistent with + ``metadata`` + of the Dataset specified by + ``dataset_id``. + + Only Annotations that both match this schema and belong to + DataItems not ignored by the split method are used in + respectively training, validation or test role, depending on + the role of the DataItem they are on. 
+ + When used in conjunction with + ``annotations_filter``, + the Annotations used for training are filtered by both + ``annotations_filter`` + and + ``annotation_schema_uri``. + model_display_name (str): + If the script produces a managed Vertex AI Model. The display name of + the Model. The name can be up to 128 characters long and can be consist + of any UTF-8 characters. + + If not provided upon creation, the job's display_name is used. + model_labels (Dict[str, str]): + Optional. The labels with user-defined metadata to + organize your Models. + Label keys and values can be no longer than 64 + characters (Unicode codepoints), can only + contain lowercase letters, numeric characters, + underscores and dashes. International characters + are allowed. + See https://goo.gl/xmQnxf for more information + and examples of labels. + base_output_dir (str): + GCS output directory of job. If not provided a + timestamped directory in the staging directory will be used. + + Vertex AI sets the following environment variables when it runs your training code: + + - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ + - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ + - AIP_TENSORBOARD_LOG_DIR: a Cloud Storage URI of a directory intended for saving TensorBoard logs, i.e. /logs/ + + service_account (str): + Specifies the service account for workload run-as account. + Users submitting jobs must have act-as permission on this run-as account. + network (str): + The full name of the Compute Engine network to which the job + should be peered. For example, projects/12345/global/networks/myVPC. + Private services access must already be configured for the network. + If left unspecified, the job is not peered with any network. + bigquery_destination (str): + Provide this field if `dataset` is a BiqQuery dataset. + The BigQuery project location where the training data is to + be written to. In the given project a new dataset is created + with name + ``dataset___`` + where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All + training input data will be written into that dataset. In + the dataset three tables will be created, ``training``, + ``validation`` and ``test``. + + - AIP_DATA_FORMAT = "bigquery". + - AIP_TRAINING_DATA_URI ="bigquery_destination.dataset_*.training" + - AIP_VALIDATION_DATA_URI = "bigquery_destination.dataset_*.validation" + - AIP_TEST_DATA_URI = "bigquery_destination.dataset_*.test" + args (List[Unions[str, int, float]]): + Command line arguments to be passed to the Python script. + environment_variables (Dict[str, str]): + Environment variables to be passed to the container. + Should be a dictionary where keys are environment variable names + and values are environment variable values for those names. + At most 10 environment variables can be specified. + The Name of the environment variable must be unique. + + environment_variables = { + 'MY_KEY': 'MY_VALUE' + } + replica_count (int): + The number of worker replicas. If replica count = 1 then one chief + replica will be provisioned. If replica_count > 1 the remainder will be + provisioned as a worker replica pool. + machine_type (str): + The type of machine to use for training. + accelerator_type (str): + Hardware accelerator type. 
One of ACCELERATOR_TYPE_UNSPECIFIED, + NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, + NVIDIA_TESLA_T4 + accelerator_count (int): + The number of accelerators to attach to a worker replica. + boot_disk_type (str): + Type of the boot disk, default is `pd-ssd`. + Valid values: `pd-ssd` (Persistent Disk Solid State Drive) or + `pd-standard` (Persistent Disk Hard Disk Drive). + boot_disk_size_gb (int): + Size in GB of the boot disk, default is 100GB. + boot disk size must be within the range of [100, 64000]. + reduction_server_replica_count (int): + The number of reduction server replicas, default is 0. + reduction_server_machine_type (str): + Optional. The type of machine to use for reduction server. + reduction_server_container_uri (str): + Optional. The Uri of the reduction server container image. + See details: https://cloud.google.com/vertex-ai/docs/training/distributed-training#reduce_training_time_with_reduction_server training_fraction_split (float): Optional. The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -3472,6 +3717,27 @@ def run( test_fraction_split (float): Optional. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. + training_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to train the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. + validation_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to validate the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. + test_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to test the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. predefined_split_column_name (str): Optional. The key is a name of one of the Dataset's data columns. The value of the key (either the label's value or @@ -3489,174 +3755,198 @@ def run( `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not present or has an invalid value, that piece is ignored by the pipeline. - Supported only for tabular and time series Datasets. - This parameter must be used with training_fraction_split, - validation_fraction_split, and test_fraction_split. - weight_column (str): - Optional. Name of the column that should be used as the weight column. - Higher values in this column give more importance to the row - during Model training. The column must have numeric values between 0 and - 10000 inclusively, and 0 value means that the row is ignored. 
- If the weight column field is not set, then all rows are assumed to have - equal weight of 1. - budget_milli_node_hours (int): - Optional. The train budget of creating this Model, expressed in milli node - hours i.e. 1,000 value in this field means 1 node hour. - The training cost of the model will not exceed this budget. The final - cost will be attempted to be close to the budget, though may end up - being (even) noticeably smaller - at the backend's discretion. This - especially may happen when further model training ceases to provide - any improvements. - If the budget is set to a value known to be insufficient to train a - Model for the given training set, the training won't be attempted and - will error. - The minimum value is 1000 and the maximum is 72000. - model_display_name (str): - Optional. If the script produces a managed Vertex AI Model. The display name of - the Model. The name can be up to 128 characters long and can be consist - of any UTF-8 characters. - - If not provided upon creation, the job's display_name is used. - model_labels (Dict[str, str]): - Optional. The labels with user-defined metadata to - organize your Models. - Label keys and values can be no longer than 64 - characters (Unicode codepoints), can only - contain lowercase letters, numeric characters, - underscores and dashes. International characters - are allowed. - See https://goo.gl/xmQnxf for more information - and examples of labels. - disable_early_stopping (bool): - Required. If true, the entire budget is used. This disables the early stopping - feature. By default, the early stopping feature is enabled, which means - that training might stop before the entire training budget has been - used, if further training does no longer brings significant improvement - to the model. - export_evaluated_data_items (bool): - Whether to export the test set predictions to a BigQuery table. - If False, then the export is not performed. - export_evaluated_data_items_bigquery_destination_uri (string): - Optional. URI of desired destination BigQuery table for exported test set predictions. - Expected format: - ``bq://::
`` + Supported only for tabular and time series Datasets. + timeout (int): + The maximum job running time in seconds. The default is 7 days. + restart_job_on_worker_restart (bool): + Restarts the entire CustomJob if a worker + gets restarted. This feature can be used by + distributed training jobs that are not resilient + to workers leaving and joining a job. + enable_web_access (bool): + Whether you want Vertex AI to enable interactive shell access + to training containers. + https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell + tensorboard (str): + Optional. The name of a Vertex AI + [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] + resource to which this CustomJob will upload Tensorboard + logs. Format: + ``projects/{project}/locations/{location}/tensorboards/{tensorboard}`` - If not specified, then results are exported to the following auto-created BigQuery - table: - ``:export_evaluated_examples__.evaluated_examples`` + The training script should write Tensorboard to following Vertex AI environment + variable: - Applies only if [export_evaluated_data_items] is True. - export_evaluated_data_items_override_destination (bool): - Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], - if the table exists, for exported test set predictions. If False, and the - table exists, then the training job will fail. + AIP_TENSORBOARD_LOG_DIR - Applies only if [export_evaluated_data_items] is True and - [export_evaluated_data_items_bigquery_destination_uri] is specified. - additional_experiments (List[str]): - Optional. Additional experiment flags for the automl tables training. + `service_account` is required with provided `tensorboard`. + For more information on configuring your service account please visit: + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. create_request_timeout (float): Optional. The timeout for the create request in seconds. + Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. Raises: - RuntimeError: If Training job has already been run or is waiting to run. + RuntimeError: If Training job has already been run, staging_bucket has not + been set, or model_display_name was provided but required arguments + were not provided in constructor. 
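A hedged sketch of enabling the TensorBoard upload and interactive shell options documented above. The project number, TensorBoard ID, and service account are placeholders, and the training code is expected to write logs to AIP_TENSORBOARD_LOG_DIR:

    # Placeholder resource names; `job` is the training job constructed above.
    model = job.run(
        replica_count=1,
        machine_type="n1-standard-4",
        service_account="trainer-sa@my-project.iam.gserviceaccount.com",  # required with tensorboard
        tensorboard="projects/123456789/locations/us-central1/tensorboards/987654321",
        enable_web_access=True,          # interactive shell for debugging
        timeout=86400,                   # 1 day, in seconds
        restart_job_on_worker_restart=False,
        sync=True,
    )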
""" - if model_display_name: - utils.validate_display_name(model_display_name) - if model_labels: - utils.validate_labels(model_labels) - - if self._is_waiting_to_run(): - raise RuntimeError("AutoML Tabular Training is already scheduled to run.") - - if self._has_run: - raise RuntimeError("AutoML Tabular Training has already run.") - - if additional_experiments: - self._add_additional_experiments(additional_experiments) + worker_pool_specs, managed_model = self._prepare_and_validate_run( + model_display_name=model_display_name, + model_labels=model_labels, + replica_count=replica_count, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + boot_disk_type=boot_disk_type, + boot_disk_size_gb=boot_disk_size_gb, + reduction_server_replica_count=reduction_server_replica_count, + reduction_server_machine_type=reduction_server_machine_type, + ) return self._run( dataset=dataset, - target_column=target_column, + annotation_schema_uri=annotation_schema_uri, + worker_pool_specs=worker_pool_specs, + managed_model=managed_model, + args=args, + environment_variables=environment_variables, + base_output_dir=base_output_dir, + service_account=service_account, + network=network, + bigquery_destination=bigquery_destination, training_fraction_split=training_fraction_split, validation_fraction_split=validation_fraction_split, test_fraction_split=test_fraction_split, + training_filter_split=training_filter_split, + validation_filter_split=validation_filter_split, + test_filter_split=test_filter_split, predefined_split_column_name=predefined_split_column_name, timestamp_split_column_name=timestamp_split_column_name, - weight_column=weight_column, - budget_milli_node_hours=budget_milli_node_hours, - model_display_name=model_display_name, - model_labels=model_labels, - disable_early_stopping=disable_early_stopping, - export_evaluated_data_items=export_evaluated_data_items, - export_evaluated_data_items_bigquery_destination_uri=export_evaluated_data_items_bigquery_destination_uri, - export_evaluated_data_items_override_destination=export_evaluated_data_items_override_destination, + timeout=timeout, + restart_job_on_worker_restart=restart_job_on_worker_restart, + enable_web_access=enable_web_access, + tensorboard=tensorboard, + reduction_server_container_uri=reduction_server_container_uri + if reduction_server_replica_count > 0 + else None, sync=sync, create_request_timeout=create_request_timeout, ) - @base.optional_sync() + @base.optional_sync(construct_object_on_arg="managed_model") def _run( self, - dataset: datasets.TabularDataset, - target_column: str, + dataset: Optional[ + Union[ + datasets.ImageDataset, + datasets.TabularDataset, + datasets.TextDataset, + datasets.VideoDataset, + ] + ], + annotation_schema_uri: Optional[str], + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, + managed_model: Optional[gca_model.Model] = None, + args: Optional[List[Union[str, float, int]]] = None, + environment_variables: Optional[Dict[str, str]] = None, + base_output_dir: Optional[str] = None, + service_account: Optional[str] = None, + network: Optional[str] = None, + bigquery_destination: Optional[str] = None, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, + training_filter_split: Optional[str] = None, + validation_filter_split: Optional[str] = None, + test_filter_split: Optional[str] = None, predefined_split_column_name: Optional[str] = None, 
timestamp_split_column_name: Optional[str] = None, - weight_column: Optional[str] = None, - budget_milli_node_hours: int = 1000, - model_display_name: Optional[str] = None, - model_labels: Optional[Dict[str, str]] = None, - disable_early_stopping: bool = False, - export_evaluated_data_items: bool = False, - export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, - export_evaluated_data_items_override_destination: bool = False, - sync: bool = True, + timeout: Optional[int] = None, + restart_job_on_worker_restart: bool = False, + enable_web_access: bool = False, + tensorboard: Optional[str] = None, + reduction_server_container_uri: Optional[str] = None, + sync=True, create_request_timeout: Optional[float] = None, - ) -> models.Model: - """Runs the training job and returns a model. + ) -> Optional[models.Model]: + """Packages local script and launches training_job. + Args: + dataset ( + Union[ + datasets.ImageDataset, + datasets.TabularDataset, + datasets.TextDataset, + datasets.VideoDataset, + ] + ): + Vertex AI to fit this training against. + annotation_schema_uri (str): + Google Cloud Storage URI points to a YAML file describing + annotation schema. + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): + Worker pools pecs required to run job. + managed_model (gca_model.Model): + Model proto if this script produces a Managed Model. + args (List[Unions[str, int, float]]): + Command line arguments to be passed to the Python script. + environment_variables (Dict[str, str]): + Environment variables to be passed to the container. + Should be a dictionary where keys are environment variable names + and values are environment variable values for those names. + At most 10 environment variables can be specified. + The Name of the environment variable must be unique. - If training on a Vertex AI dataset, you can use one of the following split configurations: - Data fraction splits: - Any of ``training_fraction_split``, ``validation_fraction_split`` and - ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If - the provided ones sum to less than 1, the remainder is assigned to sets as - decided by Vertex AI. If none of the fractions are set, by default roughly 80% - of data will be used for training, 10% for validation, and 10% for test. + environment_variables = { + 'MY_KEY': 'MY_VALUE' + } + base_output_dir (str): + GCS output directory of job. If not provided a + timestamped directory in the staging directory will be used. - Predefined splits: - Assigns input data to training, validation, and test sets based on the value of a provided key. - If using predefined splits, ``predefined_split_column_name`` must be provided. - Supported only for tabular Datasets. + Vertex AI sets the following environment variables when it runs your training code: - Timestamp splits: - Assigns input data to training, validation, and test sets - based on a provided timestamps. The youngest data pieces are - assigned to training set, next to validation set, and the oldest - to the test set. - Supported only for tabular Datasets. + - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ + - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ + - AIP_TENSORBOARD_LOG_DIR: a Cloud Storage URI of a directory intended for saving TensorBoard logs, i.e. /logs/ - Args: - dataset (datasets.TabularDataset): - Required. 
The dataset within the same Project from which data will be used to train the Model. The - Dataset must use schema compatible with Model being trained, - and what is compatible should be described in the used - TrainingPipeline's [training_task_definition] - [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. - For tabular Datasets, all their data is exported to - training, to pick and choose from. - target_column (str): - Required. The name of the column values of which the Model is to predict. + service_account (str): + Specifies the service account for workload run-as account. + Users submitting jobs must have act-as permission on this run-as account. + network (str): + The full name of the Compute Engine network to which the job + should be peered. For example, projects/12345/global/networks/myVPC. + Private services access must already be configured for the network. + If left unspecified, the job is not peered with any network. + timeout (int): + The maximum job running time in seconds. The default is 7 days. + restart_job_on_worker_restart (bool): + Restarts the entire CustomJob if a worker + gets restarted. This feature can be used by + distributed training jobs that are not resilient + to workers leaving and joining a job. + bigquery_destination (str): + The BigQuery project location where the training data is to + be written to. In the given project a new dataset is created + with name + ``dataset___`` + where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All + training input data will be written into that dataset. In + the dataset three tables will be created, ``training``, + ``validation`` and ``test``. + + - AIP_DATA_FORMAT = "bigquery". + - AIP_TRAINING_DATA_URI ="bigquery_destination.dataset_*.training" + - AIP_VALIDATION_DATA_URI = "bigquery_destination.dataset_*.validation" + - AIP_TEST_DATA_URI = "bigquery_destination.dataset_*.test" training_fraction_split (float): Optional. The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -3666,88 +3956,67 @@ def _run( test_fraction_split (float): Optional. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. + training_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to train the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. + validation_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to validate the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. + test_filter_split (str): + Optional. A filter on DataItems of the Dataset. DataItems that match + this filter are used to test the Model. A filter with same syntax + as the one used in DatasetService.ListDataItems may be used. 
If a + single DataItem is matched by more than one of the FilterSplit filters, + then it is assigned to the first set that applies to it in the training, + validation, test order. This is ignored if Dataset is not provided. predefined_split_column_name (str): - Optional. The key is a name of one of the Dataset's data - columns. The value of the key (either the label's value or - value in the column) must be one of {``training``, - ``validation``, ``test``}, and it defines to which set the - given piece of data is assigned. If for a piece of data the - key is not present or has an invalid value, that piece is - ignored by the pipeline. - - Supported only for tabular and time series Datasets. - timestamp_split_column_name (str): - Optional. The key is a name of one of the Dataset's data - columns. The value of the key values of the key (the values in - the column) must be in RFC 3339 `date-time` format, where - `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a - piece of data the key is not present or has an invalid value, - that piece is ignored by the pipeline. - Supported only for tabular and time series Datasets. - This parameter must be used with training_fraction_split, - validation_fraction_split, and test_fraction_split. - weight_column (str): - Optional. Name of the column that should be used as the weight column. - Higher values in this column give more importance to the row - during Model training. The column must have numeric values between 0 and - 10000 inclusively, and 0 value means that the row is ignored. - If the weight column field is not set, then all rows are assumed to have - equal weight of 1. - budget_milli_node_hours (int): - Optional. The train budget of creating this Model, expressed in milli node - hours i.e. 1,000 value in this field means 1 node hour. - The training cost of the model will not exceed this budget. The final - cost will be attempted to be close to the budget, though may end up - being (even) noticeably smaller - at the backend's discretion. This - especially may happen when further model training ceases to provide - any improvements. - If the budget is set to a value known to be insufficient to train a - Model for the given training set, the training won't be attempted and - will error. - The minimum value is 1000 and the maximum is 72000. - model_display_name (str): - Optional. If the script produces a managed Vertex AI Model. The display name of - the Model. The name can be up to 128 characters long and can be consist - of any UTF-8 characters. + Optional. The key is a name of one of the Dataset's data + columns. The value of the key (either the label's value or + value in the column) must be one of {``training``, + ``validation``, ``test``}, and it defines to which set the + given piece of data is assigned. If for a piece of data the + key is not present or has an invalid value, that piece is + ignored by the pipeline. - If not provided upon creation, the job's display_name is used. - model_labels (Dict[str, str]): - Optional. The labels with user-defined metadata to - organize your Models. - Label keys and values can be no longer than 64 - characters (Unicode codepoints), can only - contain lowercase letters, numeric characters, - underscores and dashes. International characters - are allowed. - See https://goo.gl/xmQnxf for more information - and examples of labels. - disable_early_stopping (bool): - Required. If true, the entire budget is used. This disables the early stopping - feature. 
By default, the early stopping feature is enabled, which means - that training might stop before the entire training budget has been - used, if further training does no longer brings significant improvement - to the model. - export_evaluated_data_items (bool): - Whether to export the test set predictions to a BigQuery table. - If False, then the export is not performed. - export_evaluated_data_items_bigquery_destination_uri (string): - Optional. URI of desired destination BigQuery table for exported test set predictions. + Supported only for tabular and time series Datasets. + timestamp_split_column_name (str): + Optional. The key is a name of one of the Dataset's data + columns. The value of the key values of the key (the values in + the column) must be in RFC 3339 `date-time` format, where + `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z). If for a + piece of data the key is not present or has an invalid value, + that piece is ignored by the pipeline. - Expected format: - ``bq://::
`` + Supported only for tabular and time series Datasets. + enable_web_access (bool): + Whether you want Vertex AI to enable interactive shell access + to training containers. + https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell + tensorboard (str): + Optional. The name of a Vertex AI + [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] + resource to which this CustomJob will upload Tensorboard + logs. Format: + ``projects/{project}/locations/{location}/tensorboards/{tensorboard}`` - If not specified, then results are exported to the following auto-created BigQuery - table: - ``:export_evaluated_examples__.evaluated_examples`` + The training script should write Tensorboard to following Vertex AI environment + variable: - Applies only if [export_evaluated_data_items] is True. - export_evaluated_data_items_override_destination (bool): - Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], - if the table exists, for exported test set predictions. If False, and the - table exists, then the training job will fail. + AIP_TENSORBOARD_LOG_DIR - Applies only if [export_evaluated_data_items] is True and - [export_evaluated_data_items_bigquery_destination_uri] is specified. + `service_account` is required with provided `tensorboard`. + For more information on configuring your service account please visit: + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training + reduction_server_container_uri (str): + Optional. The Uri of the reduction server container image. sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will @@ -3760,165 +4029,141 @@ def _run( produce a Vertex AI Model. """ - training_task_definition = schema.training_job.definition.automl_tabular - - # auto-populate transformations - if self._column_transformations is None: - _LOGGER.info( - "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations." - ) - - ( - self._column_transformations, - column_names, - ) = column_transformations_utils.get_default_column_transformations( - dataset=dataset, target_column=target_column - ) + for spec_order, spec in enumerate(worker_pool_specs): - _LOGGER.info( - "The column transformation of type 'auto' was set for the following columns: %s." 
- % column_names - ) + if not spec: + continue - training_task_inputs_dict = { - # required inputs - "targetColumn": target_column, - "transformations": self._column_transformations, - "trainBudgetMilliNodeHours": budget_milli_node_hours, - # optional inputs - "weightColumnName": weight_column, - "disableEarlyStopping": disable_early_stopping, - "optimizationObjective": self._optimization_objective, - "predictionType": self._optimization_prediction_type, - "optimizationObjectiveRecallValue": self._optimization_objective_recall_value, - "optimizationObjectivePrecisionValue": self._optimization_objective_precision_value, - } + if ( + spec_order == worker_spec_utils._SPEC_ORDERS["server_spec"] + and reduction_server_container_uri + ): + spec["container_spec"] = { + "image_uri": reduction_server_container_uri, + } + else: + spec["containerSpec"] = {"imageUri": self._container_uri} - final_export_eval_bq_uri = export_evaluated_data_items_bigquery_destination_uri - if final_export_eval_bq_uri and not final_export_eval_bq_uri.startswith( - "bq://" - ): - final_export_eval_bq_uri = f"bq://{final_export_eval_bq_uri}" + if self._command: + spec["containerSpec"]["command"] = self._command - if export_evaluated_data_items: - training_task_inputs_dict["exportEvaluatedDataItemsConfig"] = { - "destinationBigqueryUri": final_export_eval_bq_uri, - "overrideExistingTable": export_evaluated_data_items_override_destination, - } + if args: + spec["containerSpec"]["args"] = args - if self._additional_experiments: - training_task_inputs_dict[ - "additionalExperiments" - ] = self._additional_experiments + if environment_variables: + spec["containerSpec"]["env"] = [ + {"name": key, "value": value} + for key, value in environment_variables.items() + ] - model = gca_model.Model( - display_name=model_display_name or self._display_name, - labels=model_labels or self._labels, - encryption_spec=self._model_encryption_spec, + ( + training_task_inputs, + base_output_dir, + ) = self._prepare_training_task_inputs_and_output_dir( + worker_pool_specs=worker_pool_specs, + base_output_dir=base_output_dir, + service_account=service_account, + network=network, + timeout=timeout, + restart_job_on_worker_restart=restart_job_on_worker_restart, + enable_web_access=enable_web_access, + tensorboard=tensorboard, ) - return self._run_job( - training_task_definition=training_task_definition, - training_task_inputs=training_task_inputs_dict, + model = self._run_job( + training_task_definition=schema.training_job.definition.custom_task, + training_task_inputs=training_task_inputs, dataset=dataset, + annotation_schema_uri=annotation_schema_uri, training_fraction_split=training_fraction_split, validation_fraction_split=validation_fraction_split, test_fraction_split=test_fraction_split, + training_filter_split=training_filter_split, + validation_filter_split=validation_filter_split, + test_filter_split=test_filter_split, predefined_split_column_name=predefined_split_column_name, timestamp_split_column_name=timestamp_split_column_name, - model=model, + model=managed_model, + gcs_destination_uri_prefix=base_output_dir, + bigquery_destination=bigquery_destination, create_request_timeout=create_request_timeout, ) - @property - def _model_upload_fail_string(self) -> str: - """Helper property for model upload failure.""" - return ( - f"Training Pipeline {self.resource_name} is not configured to upload a " - "Model." 
- ) + return model - def _add_additional_experiments(self, additional_experiments: List[str]): - """Add experiment flags to the training job. - Args: - additional_experiments (List[str]): - Experiment flags that can enable some experimental training features. - """ - self._additional_experiments.extend(additional_experiments) - @staticmethod - def get_auto_column_specs( - dataset: datasets.TabularDataset, - target_column: str, - ) -> Dict[str, str]: - """Returns a dict with all non-target columns as keys and 'auto' as values. +class AutoMLTabularTrainingJob(_TrainingJob): + _supported_training_schemas = (schema.training_job.definition.automl_tabular,) + + def __init__( + self, + # TODO(b/223262536): Make display_name parameter fully optional in next major release + display_name: str, + optimization_prediction_type: str, + optimization_objective: Optional[str] = None, + column_specs: Optional[Dict[str, str]] = None, + column_transformations: Optional[List[Dict[str, Dict[str, str]]]] = None, + optimization_objective_recall_value: Optional[float] = None, + optimization_objective_precision_value: Optional[float] = None, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, + labels: Optional[Dict[str, str]] = None, + training_encryption_spec_key_name: Optional[str] = None, + model_encryption_spec_key_name: Optional[str] = None, + ): + """Constructs a AutoML Tabular Training Job. Example usage: - column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( - dataset=my_dataset, - target_column="my_target_column", + job = training_jobs.AutoMLTabularTrainingJob( + display_name="my_display_name", + optimization_prediction_type="classification", + optimization_objective="minimize-log-loss", + column_specs={"column_1": "auto", "column_2": "numeric"}, + labels={'key': 'value'}, ) Args: - dataset (datasets.TabularDataset): - Required. Intended dataset. - target_column(str): - Required. Intended target column. - Returns: - Dict[str, str] - Column names as keys and 'auto' as values - """ - column_names = [ - column for column in dataset.column_names if column != target_column - ] - column_specs = {column: "auto" for column in column_names} - return column_specs + display_name (str): + Required. The user-defined name of this TrainingPipeline. + optimization_prediction_type (str): + The type of prediction the Model is to produce. + "classification" - Predict one out of multiple target values is + picked for each row. + "regression" - Predict a value based on its relation to other values. + This type is available only to columns that contain + semantically numeric values, i.e. integers or floating + point number, even if stored as e.g. strings. - class column_data_types: - AUTO = "auto" - NUMERIC = "numeric" - CATEGORICAL = "categorical" - TIMESTAMP = "timestamp" - TEXT = "text" - REPEATED_NUMERIC = "repeated_numeric" - REPEATED_CATEGORICAL = "repeated_categorical" - REPEATED_TEXT = "repeated_text" + optimization_objective (str): + Optional. Objective function the Model is to be optimized towards. The training + task creates a Model that maximizes/minimizes the value of the objective + function over the validation set. + The supported optimization objectives depend on the prediction type, and + in the case of classification also the number of distinct values in the + target column (two distint values -> binary, 3 or more distinct values + -> multi class). 
+ If the field is not set, the default objective function is used. -class AutoMLForecastingTrainingJob(_TrainingJob): - _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) + Classification (binary): + "maximize-au-roc" (default) - Maximize the area under the receiver + operating characteristic (ROC) curve. + "minimize-log-loss" - Minimize log loss. + "maximize-au-prc" - Maximize the area under the precision-recall curve. + "maximize-precision-at-recall" - Maximize precision for a specified + recall value. + "maximize-recall-at-precision" - Maximize recall for a specified + precision value. - def __init__( - self, - display_name: Optional[str] = None, - optimization_objective: Optional[str] = None, - column_specs: Optional[Dict[str, str]] = None, - column_transformations: Optional[List[Dict[str, Dict[str, str]]]] = None, - project: Optional[str] = None, - location: Optional[str] = None, - credentials: Optional[auth_credentials.Credentials] = None, - labels: Optional[Dict[str, str]] = None, - training_encryption_spec_key_name: Optional[str] = None, - model_encryption_spec_key_name: Optional[str] = None, - ): - """Constructs a AutoML Forecasting Training Job. + Classification (multi class): + "minimize-log-loss" (default) - Minimize log loss. - Args: - display_name (str): - Optional. The user-defined name of this TrainingPipeline. - optimization_objective (str): - Optional. Objective function the model is to be optimized towards. - The training process creates a Model that optimizes the value of the objective - function over the validation set. The supported optimization objectives: + Regression: "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE). "minimize-mae" - Minimize mean-absolute error (MAE). "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE). - "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE). - "minimize-wape-mae" - Minimize the combination of weighted absolute percentage error (WAPE) - and mean-absolute-error (MAE). - "minimize-quantile-loss" - Minimize the quantile loss at the defined quantiles. - (Set this objective to build quantile forecasts.) column_specs (Dict[str, str]): Optional. Alternative to column_transformations where the keys of the dict are column names and their respective values are one of @@ -3929,7 +4174,10 @@ def __init__( If an input column has no transformations on it, such a column is ignored by the training, except for the targetColumn, which should have no transformations defined on. - Only one of column_transformations or column_specs should be passed. + Only one of column_transformations or column_specs should be passed. If none + of column_transformations or column_specs is passed, the local credentials + being used will try setting column_specs to "auto". To do this, the local + credentials require read access to the GCS or BigQuery training data source. column_transformations (List[Dict[str, Dict[str, str]]]): Optional. Transformations to apply to the input columns (i.e. columns other than the targetColumn). Each transformation may produce multiple @@ -3941,7 +4189,22 @@ def __init__( ignored by the training, except for the targetColumn, which should have no transformations defined on. Only one of column_transformations or column_specs should be passed. - Consider using column_specs as column_transformations will be deprecated eventually. + Consider using column_specs as column_transformations will be deprecated + eventually. 
If none of column_transformations or column_specs is passed, + the local credentials being used will try setting column_transformations to + "auto". To do this, the local credentials require read access to the GCS or + BigQuery training data source. + optimization_objective_recall_value (float): + Optional. Required when maximize-precision-at-recall optimizationObjective was + picked, represents the recall value at which the optimization is done. + + The minimum value is 0 and the maximum is 1.0. + optimization_objective_precision_value (float): + Optional. Required when maximize-recall-at-precision optimizationObjective was + picked, represents the precision value at which the optimization is + done. + + The minimum value is 0 and the maximum is 1.0. project (str): Optional. Project to run training in. Overrides project set in aiplatform.init. location (str): @@ -4007,44 +4270,32 @@ def __init__( ) self._optimization_objective = optimization_objective + self._optimization_prediction_type = optimization_prediction_type + self._optimization_objective_recall_value = optimization_objective_recall_value + self._optimization_objective_precision_value = ( + optimization_objective_precision_value + ) + self._additional_experiments = [] def run( self, - dataset: datasets.TimeSeriesDataset, + dataset: datasets.TabularDataset, target_column: str, - time_column: str, - time_series_identifier_column: str, - unavailable_at_forecast_columns: List[str], - available_at_forecast_columns: List[str], - forecast_horizon: int, - data_granularity_unit: str, - data_granularity_count: int, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, predefined_split_column_name: Optional[str] = None, timestamp_split_column_name: Optional[str] = None, weight_column: Optional[str] = None, - time_series_attribute_columns: Optional[List[str]] = None, - context_window: Optional[int] = None, - export_evaluated_data_items: bool = False, - export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, - export_evaluated_data_items_override_destination: bool = False, - quantiles: Optional[List[float]] = None, - validation_options: Optional[str] = None, budget_milli_node_hours: int = 1000, model_display_name: Optional[str] = None, model_labels: Optional[Dict[str, str]] = None, + disable_early_stopping: bool = False, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, additional_experiments: Optional[List[str]] = None, - hierarchy_group_columns: Optional[List[str]] = None, - hierarchy_group_total_weight: Optional[float] = None, - hierarchy_temporal_total_weight: Optional[float] = None, - hierarchy_group_temporal_total_weight: Optional[float] = None, - window_column: Optional[str] = None, - window_stride_length: Optional[int] = None, - window_max_count: Optional[int] = None, - holiday_regions: Optional[List[str]] = None, sync: bool = True, create_request_timeout: Optional[float] = None, ) -> models.Model: @@ -4071,47 +4322,30 @@ def run( Supported only for tabular Datasets. Args: - dataset (datasets.TimeSeriesDataset): + dataset (datasets.TabularDataset): Required. The dataset within the same Project from which data will be used to train the Model. 
The Dataset must use schema compatible with Model being trained, and what is compatible should be described in the used TrainingPipeline's [training_task_definition] [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. - For time series Datasets, all their data is exported to + For tabular Datasets, all their data is exported to training, to pick and choose from. target_column (str): - Required. Name of the column that the Model is to predict values for. This - column must be unavailable at forecast. - time_column (str): - Required. Name of the column that identifies time order in the time series. - This column must be available at forecast. - time_series_identifier_column (str): - Required. Name of the column that identifies the time series. - unavailable_at_forecast_columns (List[str]): - Required. Column names of columns that are unavailable at forecast. - Each column contains information for the given entity (identified by the - [time_series_identifier_column]) that is unknown before the forecast - (e.g. population of a city in a given year, or weather on a given day). - available_at_forecast_columns (List[str]): - Required. Column names of columns that are available at forecast. - Each column contains information for the given entity (identified by the - [time_series_identifier_column]) that is known at forecast. - forecast_horizon: (int): - Required. The amount of time into the future for which forecasted values for the target are - returned. Expressed in number of units defined by the [data_granularity_unit] and - [data_granularity_count] field. Inclusive. - data_granularity_unit (str): - Required. The data granularity unit. Accepted values are ``minute``, - ``hour``, ``day``, ``week``, ``month``, ``year``. - data_granularity_count (int): - Required. The number of data granularity units between data points in the training - data. If [data_granularity_unit] is `minute`, can be 1, 5, 10, 15, or 30. For all other - values of [data_granularity_unit], must be 1. + Required. The name of the column values of which the Model is to predict. + training_fraction_split (float): + Optional. The fraction of the input data that is to be used to train + the Model. This is ignored if Dataset is not provided. + validation_fraction_split (float): + Optional. The fraction of the input data that is to be used to validate + the Model. This is ignored if Dataset is not provided. + test_fraction_split (float): + Optional. The fraction of the input data that is to be used to evaluate + the Model. This is ignored if Dataset is not provided. predefined_split_column_name (str): Optional. The key is a name of one of the Dataset's data columns. The value of the key (either the label's value or - value in the column) must be one of {``TRAIN``, - ``VALIDATE``, ``TEST``}, and it defines to which set the + value in the column) must be one of {``training``, + ``validation``, ``test``}, and it defines to which set the given piece of data is assigned. If for a piece of data the key is not present or has an invalid value, that piece is ignored by the pipeline. @@ -4133,49 +4367,7 @@ def run( during Model training. The column must have numeric values between 0 and 10000 inclusively, and 0 value means that the row is ignored. If the weight column field is not set, then all rows are assumed to have - equal weight of 1. This column must be available at forecast. - time_series_attribute_columns (List[str]): - Optional. Column names that should be used as attribute columns. 
- Each column is constant within a time series. - context_window (int): - Optional. The amount of time into the past training and prediction data is used for - model training and prediction respectively. Expressed in number of units defined by the - [data_granularity_unit] and [data_granularity_count] fields. When not provided uses the - default value of 0 which means the model sets each series context window to be 0 (also - known as "cold start"). Inclusive. - export_evaluated_data_items (bool): - Whether to export the test set predictions to a BigQuery table. - If False, then the export is not performed. - export_evaluated_data_items_bigquery_destination_uri (string): - Optional. URI of desired destination BigQuery table for exported test set predictions. - - Expected format: - ``bq://::
`` - - If not specified, then results are exported to the following auto-created BigQuery - table: - ``:export_evaluated_examples__.evaluated_examples`` - - Applies only if [export_evaluated_data_items] is True. - export_evaluated_data_items_override_destination (bool): - Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], - if the table exists, for exported test set predictions. If False, and the - table exists, then the training job will fail. - - Applies only if [export_evaluated_data_items] is True and - [export_evaluated_data_items_bigquery_destination_uri] is specified. - quantiles (List[float]): - Quantiles to use for the ``minimize-quantile-loss`` - [AutoMLForecastingTrainingJob.optimization_objective]. This argument is required in - this case. - - Accepts up to 5 quantiles in the form of a double from 0 to 1, exclusive. - Each quantile must be unique. - validation_options (str): - Validation options for the data validation component. The available options are: - "fail-pipeline" - (default), will validate against the validation and fail the pipeline - if it fails. - "ignore-validation" - ignore the results of the validation and continue the pipeline + equal weight of 1. budget_milli_node_hours (int): Optional. The train budget of creating this Model, expressed in milli node hours i.e. 1,000 value in this field means 1 node hour. @@ -4204,58 +4396,41 @@ def run( are allowed. See https://goo.gl/xmQnxf for more information and examples of labels. - additional_experiments (List[str]): - Optional. Additional experiment flags for the time series forcasting training. - create_request_timeout (float): - Optional. The timeout for the create request in seconds. - hierarchy_group_columns (List[str]): - Optional. A list of time series attribute column names that - define the time series hierarchy. Only one level of hierarchy is - supported, ex. ``region`` for a hierarchy of stores or - ``department`` for a hierarchy of products. If multiple columns - are specified, time series will be grouped by their combined - values, ex. (``blue``, ``large``) for ``color`` and ``size``, up - to 5 columns are accepted. If no group columns are specified, - all time series are considered to be part of the same group. - hierarchy_group_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - time series in the same hierarchy group. - hierarchy_temporal_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - the horizon for a single time series. - hierarchy_group_temporal_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - both the horizon and time series in the same hierarchy group. - window_column (str): - Optional. Name of the column that should be used to filter input - rows. The column should contain either booleans or string - booleans; if the value of the row is True, generate a sliding - window from that row. - window_stride_length (int): - Optional. Step length used to generate input examples. Every - ``window_stride_length`` rows will be used to generate a sliding - window. - window_max_count (int): - Optional. Number of rows that should be used to generate input - examples. If the total row count is larger than this number, the - input data will be randomly sampled to hit the count. - holiday_regions (List[str]): - Optional. The geographical regions to use when creating holiday - features. 
This option is only allowed when data_granularity_unit - is ``day``. Acceptable values can come from any of the following - levels: - Top level: GLOBAL - Second level: continental regions - NA: North America - JAPAC: Japan and Asia Pacific - EMEA: Europe, the Middle East and Africa - LAC: Latin America and the Caribbean - Third level: countries from ISO 3166-1 Country codes. + disable_early_stopping (bool): + Required. If true, the entire budget is used. This disables the early stopping + feature. By default, the early stopping feature is enabled, which means + that training might stop before the entire training budget has been + used, if further training does no longer brings significant improvement + to the model. + export_evaluated_data_items (bool): + Whether to export the test set predictions to a BigQuery table. + If False, then the export is not performed. + export_evaluated_data_items_bigquery_destination_uri (string): + Optional. URI of desired destination BigQuery table for exported test set predictions. + + Expected format: + ``bq://::
`` + + If not specified, then results are exported to the following auto-created BigQuery + table: + ``:export_evaluated_examples__.evaluated_examples`` + + Applies only if [export_evaluated_data_items] is True. + export_evaluated_data_items_override_destination (bool): + Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], + if the table exists, for exported test set predictions. If False, and the + table exists, then the training job will fail. + + Applies only if [export_evaluated_data_items] is True and + [export_evaluated_data_items_bigquery_destination_uri] is specified. + additional_experiments (List[str]): + Optional. Additional experiment flags for the automl tables training. sync (bool): - Optional. Whether to execute this method synchronously. If False, this method + Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. - + create_request_timeout (float): + Optional. The timeout for the create request in seconds. Returns: model: The trained Vertex AI Model resource or None if training did not produce a Vertex AI Model. @@ -4263,19 +4438,16 @@ def run( Raises: RuntimeError: If Training job has already been run or is waiting to run. """ - if model_display_name: utils.validate_display_name(model_display_name) if model_labels: utils.validate_labels(model_labels) if self._is_waiting_to_run(): - raise RuntimeError( - "AutoML Forecasting Training is already scheduled to run." - ) + raise RuntimeError("AutoML Tabular Training is already scheduled to run.") if self._has_run: - raise RuntimeError("AutoML Forecasting Training has already run.") + raise RuntimeError("AutoML Tabular Training has already run.") if additional_experiments: self._add_additional_experiments(additional_experiments) @@ -4283,37 +4455,19 @@ def run( return self._run( dataset=dataset, target_column=target_column, - time_column=time_column, - time_series_identifier_column=time_series_identifier_column, - unavailable_at_forecast_columns=unavailable_at_forecast_columns, - available_at_forecast_columns=available_at_forecast_columns, - forecast_horizon=forecast_horizon, - data_granularity_unit=data_granularity_unit, - data_granularity_count=data_granularity_count, training_fraction_split=training_fraction_split, validation_fraction_split=validation_fraction_split, test_fraction_split=test_fraction_split, predefined_split_column_name=predefined_split_column_name, timestamp_split_column_name=timestamp_split_column_name, weight_column=weight_column, - time_series_attribute_columns=time_series_attribute_columns, - context_window=context_window, budget_milli_node_hours=budget_milli_node_hours, + model_display_name=model_display_name, + model_labels=model_labels, + disable_early_stopping=disable_early_stopping, export_evaluated_data_items=export_evaluated_data_items, export_evaluated_data_items_bigquery_destination_uri=export_evaluated_data_items_bigquery_destination_uri, export_evaluated_data_items_override_destination=export_evaluated_data_items_override_destination, - quantiles=quantiles, - validation_options=validation_options, - model_display_name=model_display_name, - model_labels=model_labels, - hierarchy_group_columns=hierarchy_group_columns, - hierarchy_group_total_weight=hierarchy_group_total_weight, - hierarchy_temporal_total_weight=hierarchy_temporal_total_weight, - 
hierarchy_group_temporal_total_weight=hierarchy_group_temporal_total_weight, - window_column=window_column, - window_stride_length=window_stride_length, - window_max_count=window_max_count, - holiday_regions=holiday_regions, sync=sync, create_request_timeout=create_request_timeout, ) @@ -4321,39 +4475,21 @@ def run( @base.optional_sync() def _run( self, - dataset: datasets.TimeSeriesDataset, + dataset: datasets.TabularDataset, target_column: str, - time_column: str, - time_series_identifier_column: str, - unavailable_at_forecast_columns: List[str], - available_at_forecast_columns: List[str], - forecast_horizon: int, - data_granularity_unit: str, - data_granularity_count: int, training_fraction_split: Optional[float] = None, validation_fraction_split: Optional[float] = None, test_fraction_split: Optional[float] = None, predefined_split_column_name: Optional[str] = None, timestamp_split_column_name: Optional[str] = None, weight_column: Optional[str] = None, - time_series_attribute_columns: Optional[List[str]] = None, - context_window: Optional[int] = None, - export_evaluated_data_items: bool = False, - export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, - export_evaluated_data_items_override_destination: bool = False, - quantiles: Optional[List[float]] = None, - validation_options: Optional[str] = None, budget_milli_node_hours: int = 1000, model_display_name: Optional[str] = None, model_labels: Optional[Dict[str, str]] = None, - hierarchy_group_columns: Optional[List[str]] = None, - hierarchy_group_total_weight: Optional[float] = None, - hierarchy_temporal_total_weight: Optional[float] = None, - hierarchy_group_temporal_total_weight: Optional[float] = None, - window_column: Optional[str] = None, - window_stride_length: Optional[int] = None, - window_max_count: Optional[int] = None, - holiday_regions: Optional[List[str]] = None, + disable_early_stopping: bool = False, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, sync: bool = True, create_request_timeout: Optional[float] = None, ) -> models.Model: @@ -4380,42 +4516,16 @@ def _run( Supported only for tabular Datasets. Args: - dataset (datasets.TimeSeriesDataset): + dataset (datasets.TabularDataset): Required. The dataset within the same Project from which data will be used to train the Model. The Dataset must use schema compatible with Model being trained, and what is compatible should be described in the used TrainingPipeline's [training_task_definition] [google.cloud.aiplatform.v1beta1.TrainingPipeline.training_task_definition]. - For time series Datasets, all their data is exported to + For tabular Datasets, all their data is exported to training, to pick and choose from. target_column (str): - Required. Name of the column that the Model is to predict values for. This - column must be unavailable at forecast. - time_column (str): - Required. Name of the column that identifies time order in the time series. - This column must be available at forecast. - time_series_identifier_column (str): - Required. Name of the column that identifies the time series. - unavailable_at_forecast_columns (List[str]): - Required. Column names of columns that are unavailable at forecast. - Each column contains information for the given entity (identified by the - [time_series_identifier_column]) that is unknown before the forecast - (e.g. 
population of a city in a given year, or weather on a given day). - available_at_forecast_columns (List[str]): - Required. Column names of columns that are available at forecast. - Each column contains information for the given entity (identified by the - [time_series_identifier_column]) that is known at forecast. - forecast_horizon: (int): - Required. The amount of time into the future for which forecasted values for the target are - returned. Expressed in number of units defined by the [data_granularity_unit] and - [data_granularity_count] field. Inclusive. - data_granularity_unit (str): - Required. The data granularity unit. Accepted values are ``minute``, - ``hour``, ``day``, ``week``, ``month``, ``year``. - data_granularity_count (int): - Required. The number of data granularity units between data points in the training - data. If [data_granularity_unit] is `minute`, can be 1, 5, 10, 15, or 30. For all other - values of [data_granularity_unit], must be 1. + Required. The name of the column values of which the Model is to predict. training_fraction_split (float): Optional. The fraction of the input data that is to be used to train the Model. This is ignored if Dataset is not provided. @@ -4451,48 +4561,7 @@ def _run( during Model training. The column must have numeric values between 0 and 10000 inclusively, and 0 value means that the row is ignored. If the weight column field is not set, then all rows are assumed to have - equal weight of 1. This column must be available at forecast. - time_series_attribute_columns (List[str]): - Optional. Column names that should be used as attribute columns. - Each column is constant within a time series. - context_window (int): - Optional. The number of periods offset into the past to restrict past sequence, where each - period is one unit of granularity as defined by [period]. When not provided uses the - default value of 0 which means the model sets each series historical window to be 0 (also - known as "cold start"). Inclusive. - export_evaluated_data_items (bool): - Whether to export the test set predictions to a BigQuery table. - If False, then the export is not performed. - export_evaluated_data_items_bigquery_destination_uri (string): - Optional. URI of desired destination BigQuery table for exported test set predictions. - - Expected format: - ``bq://::
`` - - If not specified, then results are exported to the following auto-created BigQuery - table: - ``:export_evaluated_examples__.evaluated_examples`` - - Applies only if [export_evaluated_data_items] is True. - export_evaluated_data_items_override_destination (bool): - Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], - if the table exists, for exported test set predictions. If False, and the - table exists, then the training job will fail. - - Applies only if [export_evaluated_data_items] is True and - [export_evaluated_data_items_bigquery_destination_uri] is specified. - quantiles (List[float]): - Quantiles to use for the `minimize-quantile-loss` - [AutoMLForecastingTrainingJob.optimization_objective]. This argument is required in - this case. - - Accepts up to 5 quantiles in the form of a double from 0 to 1, exclusive. - Each quantile must be unique. - validation_options (str): - Validation options for the data validation component. The available options are: - "fail-pipeline" - (default), will validate against the validation and fail the pipeline - if it fails. - "ignore-validation" - ignore the results of the validation and continue the pipeline + equal weight of 1. budget_milli_node_hours (int): Optional. The train budget of creating this Model, expressed in milli node hours i.e. 1,000 value in this field means 1 node hour. @@ -4521,49 +4590,33 @@ def _run( are allowed. See https://goo.gl/xmQnxf for more information and examples of labels. - hierarchy_group_columns (List[str]): - Optional. A list of time series attribute column names that - define the time series hierarchy. Only one level of hierarchy is - supported, ex. ``region`` for a hierarchy of stores or - ``department`` for a hierarchy of products. If multiple columns - are specified, time series will be grouped by their combined - values, ex. (``blue``, ``large``) for ``color`` and ``size``, up - to 5 columns are accepted. If no group columns are specified, - all time series are considered to be part of the same group. - hierarchy_group_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - time series in the same hierarchy group. - hierarchy_temporal_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - the horizon for a single time series. - hierarchy_group_temporal_total_weight (float): - Optional. The weight of the loss for predictions aggregated over - both the horizon and time series in the same hierarchy group. - window_column (str): - Optional. Name of the column that should be used to filter input - rows. The column should contain either booleans or string - booleans; if the value of the row is True, generate a sliding - window from that row. - window_stride_length (int): - Optional. Step length used to generate input examples. Every - ``window_stride_length`` rows will be used to generate a sliding - window. - window_max_count (int): - Optional. Number of rows that should be used to generate input - examples. If the total row count is larger than this number, the - input data will be randomly sampled to hit the count. - holiday_regions (List[str]): - Optional. The geographical regions to use when creating holiday - features. This option is only allowed when data_granularity_unit - is ``day``. 
Acceptable values can come from any of the following - levels: - Top level: GLOBAL - Second level: continental regions - NA: North America - JAPAC: Japan and Asia Pacific - EMEA: Europe, the Middle East and Africa - LAC: Latin America and the Caribbean - Third level: countries from ISO 3166-1 Country codes. + disable_early_stopping (bool): + Required. If true, the entire budget is used. This disables the early stopping + feature. By default, the early stopping feature is enabled, which means + that training might stop before the entire training budget has been + used, if further training does no longer brings significant improvement + to the model. + export_evaluated_data_items (bool): + Whether to export the test set predictions to a BigQuery table. + If False, then the export is not performed. + export_evaluated_data_items_bigquery_destination_uri (string): + Optional. URI of desired destination BigQuery table for exported test set predictions. + + Expected format: + ``bq://::
`` + + If not specified, then results are exported to the following auto-created BigQuery + table: + ``:export_evaluated_examples__.evaluated_examples`` + + Applies only if [export_evaluated_data_items] is True. + export_evaluated_data_items_override_destination (bool): + Whether to override the contents of [export_evaluated_data_items_bigquery_destination_uri], + if the table exists, for exported test set predictions. If False, and the + table exists, then the training job will fail. + + Applies only if [export_evaluated_data_items] is True and + [export_evaluated_data_items_bigquery_destination_uri] is specified. sync (bool): Whether to execute this method synchronously. If False, this method will be executed in concurrent Future and any downstream object will @@ -4576,7 +4629,7 @@ def _run( produce a Vertex AI Model. """ - training_task_definition = schema.training_job.definition.automl_forecasting + training_task_definition = schema.training_job.definition.automl_tabular # auto-populate transformations if self._column_transformations is None: @@ -4587,60 +4640,28 @@ def _run( ( self._column_transformations, column_names, - ) = dataset._get_default_column_transformations(target_column) + ) = column_transformations_utils.get_default_column_transformations( + dataset=dataset, target_column=target_column + ) _LOGGER.info( "The column transformation of type 'auto' was set for the following columns: %s." % column_names ) - window_config = self._create_window_config( - column=window_column, - stride_length=window_stride_length, - max_count=window_max_count, - ) - training_task_inputs_dict = { - # required inputs - "targetColumn": target_column, - "timeColumn": time_column, - "timeSeriesIdentifierColumn": time_series_identifier_column, - "timeSeriesAttributeColumns": time_series_attribute_columns, - "unavailableAtForecastColumns": unavailable_at_forecast_columns, - "availableAtForecastColumns": available_at_forecast_columns, - "forecastHorizon": forecast_horizon, - "dataGranularity": { - "unit": data_granularity_unit, - "quantity": data_granularity_count, - }, - "transformations": self._column_transformations, - "trainBudgetMilliNodeHours": budget_milli_node_hours, - # optional inputs - "weightColumn": weight_column, - "contextWindow": context_window, - "quantiles": quantiles, - "validationOptions": validation_options, - "optimizationObjective": self._optimization_objective, - "holidayRegions": holiday_regions, - } - - # TODO(TheMichaelHu): Remove the ifs once the API supports these inputs. 
- if any( - [ - hierarchy_group_columns, - hierarchy_group_total_weight, - hierarchy_temporal_total_weight, - hierarchy_group_temporal_total_weight, - ] - ): - training_task_inputs_dict["hierarchyConfig"] = { - "groupColumns": hierarchy_group_columns, - "groupTotalWeight": hierarchy_group_total_weight, - "temporalTotalWeight": hierarchy_temporal_total_weight, - "groupTemporalTotalWeight": hierarchy_group_temporal_total_weight, - } - if window_config: - training_task_inputs_dict["windowConfig"] = window_config + # required inputs + "targetColumn": target_column, + "transformations": self._column_transformations, + "trainBudgetMilliNodeHours": budget_milli_node_hours, + # optional inputs + "weightColumnName": weight_column, + "disableEarlyStopping": disable_early_stopping, + "optimizationObjective": self._optimization_objective, + "predictionType": self._optimization_prediction_type, + "optimizationObjectiveRecallValue": self._optimization_objective_recall_value, + "optimizationObjectivePrecisionValue": self._optimization_objective_precision_value, + } final_export_eval_bq_uri = export_evaluated_data_items_bigquery_destination_uri if final_export_eval_bq_uri and not final_export_eval_bq_uri.startswith( @@ -4665,7 +4686,7 @@ def _run( encryption_spec=self._model_encryption_spec, ) - new_model = self._run_job( + return self._run_job( training_task_definition=training_task_definition, training_task_inputs=training_task_inputs_dict, dataset=dataset, @@ -4678,14 +4699,6 @@ def _run( create_request_timeout=create_request_timeout, ) - if export_evaluated_data_items: - _LOGGER.info( - "Exported examples available at:\n%s" - % self.evaluated_data_items_bigquery_uri - ) - - return new_model - @property def _model_upload_fail_string(self) -> str: """Helper property for model upload failure.""" @@ -4694,23 +4707,6 @@ def _model_upload_fail_string(self) -> str: "Model." ) - @property - def evaluated_data_items_bigquery_uri(self) -> Optional[str]: - """BigQuery location of exported evaluated examples from the Training Job - Returns: - str: BigQuery uri for the exported evaluated examples if the export - feature is enabled for training. - None: If the export feature was not enabled for training. - """ - - self._assert_gca_resource_is_available() - - metadata = self._gca_resource.training_task_metadata - if metadata and "evaluatedDataItemsBigqueryUri" in metadata: - return metadata["evaluatedDataItemsBigqueryUri"] - - return None - def _add_additional_experiments(self, additional_experiments: List[str]): """Add experiment flags to the training job. Args: @@ -4720,27 +4716,213 @@ def _add_additional_experiments(self, additional_experiments: List[str]): self._additional_experiments.extend(additional_experiments) @staticmethod - def _create_window_config( - column: Optional[str] = None, - stride_length: Optional[int] = None, - max_count: Optional[int] = None, - ) -> Optional[Dict[str, Union[int, str]]]: - """Creates a window config from training job arguments.""" - configs = { - "column": column, - "strideLength": stride_length, - "maxCount": max_count, - } - present_configs = {k: v for k, v in configs.items() if v is not None} - if not present_configs: - return None - if len(present_configs) > 1: - raise ValueError( - "More than one windowing strategy provided. Make sure only one " - "of window_column, window_stride_length, or window_max_count " - "is specified." 
- ) - return present_configs + def get_auto_column_specs( + dataset: datasets.TabularDataset, + target_column: str, + ) -> Dict[str, str]: + """Returns a dict with all non-target columns as keys and 'auto' as values. + + Example usage: + + column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs( + dataset=my_dataset, + target_column="my_target_column", + ) + + Args: + dataset (datasets.TabularDataset): + Required. Intended dataset. + target_column(str): + Required. Intended target column. + Returns: + Dict[str, str] + Column names as keys and 'auto' as values + """ + column_names = [ + column for column in dataset.column_names if column != target_column + ] + column_specs = {column: "auto" for column in column_names} + return column_specs + + class column_data_types: + AUTO = "auto" + NUMERIC = "numeric" + CATEGORICAL = "categorical" + TIMESTAMP = "timestamp" + TEXT = "text" + REPEATED_NUMERIC = "repeated_numeric" + REPEATED_CATEGORICAL = "repeated_categorical" + REPEATED_TEXT = "repeated_text" + + +class AutoMLForecastingTrainingJob(_ForecastingTrainingJob): + _model_type = "AutoML" + _training_task_definition = schema.training_job.definition.automl_forecasting + _supported_training_schemas = (schema.training_job.definition.automl_forecasting,) + + def run( + self, + dataset: datasets.TimeSeriesDataset, + target_column: str, + time_column: str, + time_series_identifier_column: str, + unavailable_at_forecast_columns: List[str], + available_at_forecast_columns: List[str], + forecast_horizon: int, + data_granularity_unit: str, + data_granularity_count: int, + training_fraction_split: Optional[float] = None, + validation_fraction_split: Optional[float] = None, + test_fraction_split: Optional[float] = None, + predefined_split_column_name: Optional[str] = None, + timestamp_split_column_name: Optional[str] = None, + weight_column: Optional[str] = None, + time_series_attribute_columns: Optional[List[str]] = None, + context_window: Optional[int] = None, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, + quantiles: Optional[List[float]] = None, + validation_options: Optional[str] = None, + budget_milli_node_hours: int = 1000, + model_display_name: Optional[str] = None, + model_labels: Optional[Dict[str, str]] = None, + additional_experiments: Optional[List[str]] = None, + hierarchy_group_columns: Optional[List[str]] = None, + hierarchy_group_total_weight: Optional[float] = None, + hierarchy_temporal_total_weight: Optional[float] = None, + hierarchy_group_temporal_total_weight: Optional[float] = None, + window_column: Optional[str] = None, + window_stride_length: Optional[int] = None, + window_max_count: Optional[int] = None, + holiday_regions: Optional[List[str]] = None, + sync: bool = True, + create_request_timeout: Optional[float] = None, + ) -> models.Model: + return super().run( + dataset=dataset, + target_column=target_column, + time_column=time_column, + time_series_identifier_column=time_series_identifier_column, + unavailable_at_forecast_columns=unavailable_at_forecast_columns, + available_at_forecast_columns=available_at_forecast_columns, + forecast_horizon=forecast_horizon, + data_granularity_unit=data_granularity_unit, + data_granularity_count=data_granularity_count, + training_fraction_split=training_fraction_split, + validation_fraction_split=validation_fraction_split, + test_fraction_split=test_fraction_split, + 
predefined_split_column_name=predefined_split_column_name, + timestamp_split_column_name=timestamp_split_column_name, + weight_column=weight_column, + time_series_attribute_columns=time_series_attribute_columns, + context_window=context_window, + budget_milli_node_hours=budget_milli_node_hours, + export_evaluated_data_items=export_evaluated_data_items, + export_evaluated_data_items_bigquery_destination_uri=export_evaluated_data_items_bigquery_destination_uri, + export_evaluated_data_items_override_destination=export_evaluated_data_items_override_destination, + quantiles=quantiles, + validation_options=validation_options, + model_display_name=model_display_name, + model_labels=model_labels, + additional_experiments=additional_experiments, + hierarchy_group_columns=hierarchy_group_columns, + hierarchy_group_total_weight=hierarchy_group_total_weight, + hierarchy_temporal_total_weight=hierarchy_temporal_total_weight, + hierarchy_group_temporal_total_weight=hierarchy_group_temporal_total_weight, + window_column=window_column, + window_stride_length=window_stride_length, + window_max_count=window_max_count, + holiday_regions=holiday_regions, + sync=sync, + create_request_timeout=create_request_timeout, + ) + + +class SequenceToSequencePlusForecastingTrainingJob(_ForecastingTrainingJob): + _model_type = "Seq2Seq" + _training_task_definition = schema.training_job.definition.seq2seq_plus_forecasting + _supported_training_schemas = ( + schema.training_job.definition.seq2seq_plus_forecasting, + ) + + def run( + self, + dataset: datasets.TimeSeriesDataset, + target_column: str, + time_column: str, + time_series_identifier_column: str, + unavailable_at_forecast_columns: List[str], + available_at_forecast_columns: List[str], + forecast_horizon: int, + data_granularity_unit: str, + data_granularity_count: int, + training_fraction_split: Optional[float] = None, + validation_fraction_split: Optional[float] = None, + test_fraction_split: Optional[float] = None, + predefined_split_column_name: Optional[str] = None, + timestamp_split_column_name: Optional[str] = None, + weight_column: Optional[str] = None, + time_series_attribute_columns: Optional[List[str]] = None, + context_window: Optional[int] = None, + export_evaluated_data_items: bool = False, + export_evaluated_data_items_bigquery_destination_uri: Optional[str] = None, + export_evaluated_data_items_override_destination: bool = False, + quantiles: Optional[List[float]] = None, + validation_options: Optional[str] = None, + budget_milli_node_hours: int = 1000, + model_display_name: Optional[str] = None, + model_labels: Optional[Dict[str, str]] = None, + additional_experiments: Optional[List[str]] = None, + hierarchy_group_columns: Optional[List[str]] = None, + hierarchy_group_total_weight: Optional[float] = None, + hierarchy_temporal_total_weight: Optional[float] = None, + hierarchy_group_temporal_total_weight: Optional[float] = None, + window_column: Optional[str] = None, + window_stride_length: Optional[int] = None, + window_max_count: Optional[int] = None, + holiday_regions: Optional[List[str]] = None, + sync: bool = True, + create_request_timeout: Optional[float] = None, + ) -> models.Model: + return super().run( + dataset=dataset, + target_column=target_column, + time_column=time_column, + time_series_identifier_column=time_series_identifier_column, + unavailable_at_forecast_columns=unavailable_at_forecast_columns, + available_at_forecast_columns=available_at_forecast_columns, + forecast_horizon=forecast_horizon, + 
+            data_granularity_unit=data_granularity_unit,
+            data_granularity_count=data_granularity_count,
+            training_fraction_split=training_fraction_split,
+            validation_fraction_split=validation_fraction_split,
+            test_fraction_split=test_fraction_split,
+            predefined_split_column_name=predefined_split_column_name,
+            timestamp_split_column_name=timestamp_split_column_name,
+            weight_column=weight_column,
+            time_series_attribute_columns=time_series_attribute_columns,
+            context_window=context_window,
+            budget_milli_node_hours=budget_milli_node_hours,
+            export_evaluated_data_items=export_evaluated_data_items,
+            export_evaluated_data_items_bigquery_destination_uri=export_evaluated_data_items_bigquery_destination_uri,
+            export_evaluated_data_items_override_destination=export_evaluated_data_items_override_destination,
+            quantiles=quantiles,
+            validation_options=validation_options,
+            model_display_name=model_display_name,
+            model_labels=model_labels,
+            additional_experiments=additional_experiments,
+            hierarchy_group_columns=hierarchy_group_columns,
+            hierarchy_group_total_weight=hierarchy_group_total_weight,
+            hierarchy_temporal_total_weight=hierarchy_temporal_total_weight,
+            hierarchy_group_temporal_total_weight=hierarchy_group_temporal_total_weight,
+            window_column=window_column,
+            window_stride_length=window_stride_length,
+            window_max_count=window_max_count,
+            holiday_regions=holiday_regions,
+            sync=sync,
+            create_request_timeout=create_request_timeout,
+        )
 class AutoMLImageTrainingJob(_TrainingJob):
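For context, a minimal usage sketch of the new SequenceToSequencePlusForecastingTrainingJob: it mirrors the constructor arguments used in the system test below and the run() signature above. The project, dataset ID, and column names are illustrative placeholders only, not values taken from this change.

    from google.cloud import aiplatform

    # Placeholder project and location.
    aiplatform.init(project="my-project", location="us-central1")

    # Placeholder dataset ID; any existing TimeSeriesDataset resource works here.
    dataset = aiplatform.TimeSeriesDataset(dataset_name="1234567890")

    job = aiplatform.SequenceToSequencePlusForecastingTrainingJob(
        display_name="seq2seq-forecasting-sketch",
        optimization_objective="minimize-rmse",
    )

    # Only the required run() arguments are shown; all others keep their defaults.
    model = job.run(
        dataset=dataset,
        target_column="sale_dollars",  # placeholder column names
        time_column="date",
        time_series_identifier_column="store_name",
        unavailable_at_forecast_columns=["sale_dollars"],
        available_at_forecast_columns=["date"],
        forecast_horizon=30,
        data_granularity_unit="day",
        data_granularity_count=1,
    )
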
diff --git a/tests/system/aiplatform/test_e2e_forecasting.py b/tests/system/aiplatform/test_e2e_forecasting.py
index b0f3e19711..024946b91b 100644
--- a/tests/system/aiplatform/test_e2e_forecasting.py
+++ b/tests/system/aiplatform/test_e2e_forecasting.py
@@ -16,6 +16,7 @@
 #
 from google.cloud import aiplatform
+from google.cloud.aiplatform import training_jobs
 from google.cloud.aiplatform.compat.types import job_state
 from google.cloud.aiplatform.compat.types import pipeline_state
 import pytest
@@ -35,12 +36,19 @@ class TestEndToEndForecasting(e2e_base.TestEndToEnd):
     _temp_prefix = "temp-vertex-sdk-e2e-forecasting"
-    def test_end_to_end_forecasting(self, shared_state):
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            pytest.param(
+                training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+                marks=pytest.mark.skip(reason="Seq2Seq not yet released."),
+            ),
+        ],
+    )
+    def test_end_to_end_forecasting(self, shared_state, training_job):
         """Builds a dataset, trains models, and gets batch predictions."""
-        ds = None
-        automl_job = None
-        automl_model = None
-        automl_batch_prediction_job = None
+        resources = []
         aiplatform.init(
             project=e2e_base._PROJECT,
@@ -48,14 +56,13 @@ def test_end_to_end_forecasting(self, shared_state):
             staging_bucket=shared_state["staging_bucket_name"],
         )
         try:
-            # Create and import to single managed dataset for both training
-            # jobs.
             ds = aiplatform.TimeSeriesDataset.create(
                 display_name=self._make_display_name("dataset"),
                 bq_source=[_TRAINING_DATASET_BQ_PATH],
                 sync=False,
                 create_request_timeout=180.0,
             )
+            resources.append(ds)
             time_column = "date"
             time_series_identifier_column = "store_name"
@@ -68,17 +75,14 @@ def test_end_to_end_forecasting(self, shared_state):
                 "county": "categorical",
             }
-            # Define both training jobs
-            # TODO(humichael): Add seq2seq job.
-            automl_job = aiplatform.AutoMLForecastingTrainingJob(
-                display_name=self._make_display_name("train-housing-automl"),
+            job = training_job(
+                display_name=self._make_display_name("train-housing-forecasting"),
                 optimization_objective="minimize-rmse",
                 column_specs=column_specs,
             )
+            resources.append(job)
-            # Kick off both training jobs, AutoML job will take approx one hour
-            # to run.
-            automl_model = automl_job.run(
+            model = job.run(
                 dataset=ds,
                 target_column=target_column,
                 time_column=time_column,
@@ -91,13 +95,18 @@ def test_end_to_end_forecasting(self, shared_state):
                 data_granularity_unit="day",
                 data_granularity_count=1,
                 budget_milli_node_hours=1000,
-                model_display_name=self._make_display_name("automl-liquor-model"),
+                holiday_regions=["GLOBAL"],
+                hierarchy_group_total_weight=1,
+                window_stride_length=1,
+                model_display_name=self._make_display_name("forecasting-liquor-model"),
                 sync=False,
             )
+            resources.append(model)
-            automl_batch_prediction_job = automl_model.batch_predict(
-                job_display_name=self._make_display_name("automl-liquor-model"),
+            batch_prediction_job = model.batch_predict(
+                job_display_name=self._make_display_name("forecasting-liquor-model"),
                 instances_format="bigquery",
+                predictions_format="csv",
                 machine_type="n1-standard-4",
                 bigquery_source=_PREDICTION_DATASET_BQ_PATH,
                 gcs_destination_prefix=(
@@ -105,23 +114,11 @@ def test_end_to_end_forecasting(self, shared_state):
                 ),
                 sync=False,
             )
+            resources.append(batch_prediction_job)
-            automl_batch_prediction_job.wait()
-
-            assert (
-                automl_job.state
-                == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
-            )
-            assert (
-                automl_batch_prediction_job.state
-                == job_state.JobState.JOB_STATE_SUCCEEDED
-            )
+            batch_prediction_job.wait()
+            assert job.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
+            assert batch_prediction_job.state == job_state.JobState.JOB_STATE_SUCCEEDED
         finally:
-            if ds is not None:
-                ds.delete()
-            if automl_job is not None:
-                automl_job.delete()
-            if automl_model is not None:
-                automl_model.delete()
-            if automl_batch_prediction_job is not None:
-                automl_batch_prediction_job.delete()
+            for resource in resources:
+                resource.delete()
diff --git a/tests/unit/aiplatform/test_automl_forecasting_training_jobs.py b/tests/unit/aiplatform/test_automl_forecasting_training_jobs.py
index 21ca78da2e..64e85befa6 100644
--- a/tests/unit/aiplatform/test_automl_forecasting_training_jobs.py
+++ b/tests/unit/aiplatform/test_automl_forecasting_training_jobs.py
@@ -24,7 +24,7 @@
 from google.cloud.aiplatform import datasets
 from google.cloud.aiplatform import initializer
 from google.cloud.aiplatform import schema
-from google.cloud.aiplatform.training_jobs import AutoMLForecastingTrainingJob
+from google.cloud.aiplatform import training_jobs
 from google.cloud.aiplatform.compat.services import (
     model_service_client,
@@ -266,7 +266,7 @@ def mock_dataset_nontimeseries():
 @pytest.mark.usefixtures("google_auth_mock")
-class TestAutoMLForecastingTrainingJob:
+class TestForecastingTrainingJob:
     def setup_method(self):
         importlib.reload(initializer)
         importlib.reload(aiplatform)
@@ -275,6 +275,13 @@ def teardown_method(self):
         initializer.global_pool.shutdown(wait=True)
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_call_pipeline_service_create(
         self,
         mock_pipeline_service_create,
@@ -282,10 +289,11 @@ def test_run_call_pipeline_service_create(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -344,7 +352,7 @@ def test_run_call_pipeline_service_create(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
             labels=_TEST_LABELS,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -371,6 +379,13 @@ def test_run_call_pipeline_service_create(
         assert job.state == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_call_pipeline_service_create_with_timeout(
         self,
         mock_pipeline_service_create,
@@ -378,10 +393,11 @@ def test_run_call_pipeline_service_create_with_timeout(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -440,7 +456,7 @@ def test_run_call_pipeline_service_create_with_timeout(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
             labels=_TEST_LABELS,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -454,16 +470,24 @@
     @pytest.mark.usefixtures("mock_pipeline_service_get")
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_call_pipeline_if_no_model_display_name_nor_model_labels(
         self,
         mock_pipeline_service_create,
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -517,7 +541,7 @@ def test_run_call_pipeline_if_no_model_display_name_nor_model_labels(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
             labels=_TEST_LABELS,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -531,16 +555,24 @@ def test_run_call_pipeline_if_no_model_display_name_nor_model_labels(
     @pytest.mark.usefixtures("mock_pipeline_service_get")
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_call_pipeline_if_set_additional_experiments(
         self,
         mock_pipeline_service_create,
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -591,7 +623,7 @@ def test_run_call_pipeline_if_set_additional_experiments(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS_WITH_ADDITIONAL_EXPERIMENTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -609,14 +641,22 @@ def test_run_call_pipeline_if_set_additional_experiments(
         "mock_model_service_get",
     )
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_called_twice_raises(
         self,
         mock_dataset_time_series,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -686,16 +726,24 @@ def test_run_called_twice_raises(
         )
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_run_raises_if_pipeline_fails(
         self,
         mock_pipeline_service_create_and_get_with_fail,
         mock_dataset_time_series,
         sync,
+        training_job,
     ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -739,10 +787,21 @@ def test_run_raises_if_pipeline_fails(
         with pytest.raises(RuntimeError):
             job.get_model()
-    def test_raises_before_run_is_called(self, mock_pipeline_service_create):
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
+    def test_raises_before_run_is_called(
+        self,
+        mock_pipeline_service_create,
+        training_job,
+    ):
         aiplatform.init(project=_TEST_PROJECT, staging_bucket=_TEST_BUCKET_NAME)
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -758,6 +817,13 @@ def test_raises_before_run_is_called(self, mock_pipeline_service_create):
             job.state
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_splits_fraction(
         self,
         mock_pipeline_service_create,
@@ -765,10 +831,11 @@ def test_splits_fraction(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         """
         Initiate aiplatform with encryption key name.
-        Create and run an AutoML Video Classification training job, verify calls and return value
+        Create and run a Forecasting training job, verify calls and return value
         """
         aiplatform.init(
@@ -776,7 +843,7 @@ def test_splits_fraction(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -838,7 +905,7 @@ def test_splits_fraction(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -852,6 +919,13 @@ def test_splits_fraction(
         )
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_splits_timestamp(
         self,
         mock_pipeline_service_create,
@@ -859,10 +933,11 @@ def test_splits_timestamp(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         """Initiate aiplatform with encryption key name.
-        Create and run an AutoML Forecasting training job, verify calls and
+        Create and run a Forecasting training job, verify calls and
         return value
         """
         aiplatform.init(
@@ -871,7 +946,7 @@ def test_splits_timestamp(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -934,9 +1009,7 @@ def test_splits_timestamp(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=(
-                schema.training_job.definition.automl_forecasting
-            ),
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -950,6 +1023,13 @@ def test_splits_timestamp(
         )
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_splits_predefined(
         self,
         mock_pipeline_service_create,
@@ -957,10 +1037,11 @@ def test_splits_predefined(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         """
         Initiate aiplatform with encryption key name.
-        Create and run an AutoML Video Classification training job, verify calls and return value
+        Create and run a Forecasting training job, verify calls and return value
         """
         aiplatform.init(
@@ -968,7 +1049,7 @@ def test_splits_predefined(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -1026,7 +1107,7 @@ def test_splits_predefined(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,
@@ -1040,6 +1121,13 @@ def test_splits_predefined(
         )
     @pytest.mark.parametrize("sync", [True, False])
+    @pytest.mark.parametrize(
+        "training_job",
+        [
+            training_jobs.AutoMLForecastingTrainingJob,
+            training_jobs.SequenceToSequencePlusForecastingTrainingJob,
+        ],
+    )
     def test_splits_default(
         self,
         mock_pipeline_service_create,
@@ -1047,10 +1135,11 @@ def test_splits_default(
         mock_dataset_time_series,
         mock_model_service_get,
         sync,
+        training_job,
     ):
         """
         Initiate aiplatform with encryption key name.
-        Create and run an AutoML Video Classification training job, verify calls and return value
+        Create and run a Forecasting training job, verify calls and return value
         """
         aiplatform.init(
@@ -1058,7 +1147,7 @@ def test_splits_default(
             encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
         )
-        job = AutoMLForecastingTrainingJob(
+        job = training_job(
             display_name=_TEST_DISPLAY_NAME,
             optimization_objective=_TEST_TRAINING_OPTIMIZATION_OBJECTIVE_NAME,
             column_transformations=_TEST_TRAINING_COLUMN_TRANSFORMATIONS,
@@ -1110,7 +1199,7 @@ def test_splits_default(
         true_training_pipeline = gca_training_pipeline.TrainingPipeline(
             display_name=_TEST_DISPLAY_NAME,
-            training_task_definition=schema.training_job.definition.automl_forecasting,
+            training_task_definition=training_job._training_task_definition,
             training_task_inputs=_TEST_TRAINING_TASK_INPUTS,
             model_to_upload=true_managed_model,
             input_data_config=true_input_data_config,