From c3aeb9219664196e57209888f435bd6a08824074 Mon Sep 17 00:00:00 2001 From: Zhenhua Li Date: Mon, 30 Sep 2024 10:11:33 -0700 Subject: [PATCH] Add resource Dataproc batch (#11750) --- mmv1/products/dataproc/Batch.yaml | 539 ++++++++++++++++++ mmv1/products/dataproc/product.yaml | 2 - .../constants/cloud_dataproc_batch.go.tmpl | 15 + .../decoders/cloud_dataproc_batch.go.tmpl | 37 ++ .../examples/dataproc_batch_pyspark.tf.tmpl | 28 + .../examples/dataproc_batch_spark.tf.tmpl | 25 + .../dataproc_batch_spark_full.tf.tmpl | 114 ++++ .../examples/dataproc_batch_sparkr.tf.tmpl | 24 + .../examples/dataproc_batch_sparksql.tf.tmpl | 24 + .../terraform/acctest/bootstrap_test_utils.go | 27 +- .../dataproc/resource_dataproc_batch_test.go | 59 ++ 11 files changed, 890 insertions(+), 4 deletions(-) create mode 100644 mmv1/products/dataproc/Batch.yaml create mode 100644 mmv1/templates/terraform/constants/cloud_dataproc_batch.go.tmpl create mode 100644 mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl create mode 100644 mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go diff --git a/mmv1/products/dataproc/Batch.yaml b/mmv1/products/dataproc/Batch.yaml new file mode 100644 index 000000000000..febffeeb45b8 --- /dev/null +++ b/mmv1/products/dataproc/Batch.yaml @@ -0,0 +1,539 @@ +# Copyright 2024 Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +name: 'Batch' +description: | + Dataproc Serverless Batches lets you run Spark workloads without requiring you to + provision and manage your own Dataproc cluster. 
+references: + guides: + 'Dataproc Serverless Batches Intro': 'https://cloud.google.com/dataproc-serverless/docs/overview' + api: 'https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches' +docs: +id_format: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +base_url: 'projects/{{project}}/locations/{{location}}/batches' +self_link: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +create_url: 'projects/{{project}}/locations/{{location}}/batches?batchId={{batch_id}}' +delete_url: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +immutable: true +import_format: + - 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +timeouts: + insert_minutes: 10 + update_minutes: 20 + delete_minutes: 5 +autogen_async: true +async: + actions: ['create', 'delete', 'update'] + type: 'OpAsync' + operation: + base_url: '{{op_id}}' + result: + resource_inside_response: false +collection_url_key: 'batches' +custom_code: + constants: 'templates/terraform/constants/cloud_dataproc_batch.go.tmpl' + decoder: 'templates/terraform/decoders/cloud_dataproc_batch.go.tmpl' +examples: + - name: 'dataproc_batch_spark' + primary_resource_id: 'example_batch_spark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-spark-test-network", "dataproc-spark-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_spark_full' + primary_resource_id: 'example_batch_spark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + dataproc_batch: 'dataproc-batch' + prevent_destroy: 'true' + key_name: 'example-key' + keyring_name: 'example-keyring' + bucket_name: 'dataproc-bucket' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_sparksql' + primary_resource_id: 'example_batch_sparsql' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-sparksql-test-network", "dataproc-sparksql-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_pyspark' + primary_resource_id: 'example_batch_pyspark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-pyspark-test-network", "dataproc-pyspark-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_sparkr' + primary_resource_id: 'example_batch_sparkr' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 
'PROJECT_NAME'
+      test_vars_overrides:
+        'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-pyspark-test-network", "dataproc-pyspark-test-subnetwork")'
+        'prevent_destroy': 'false'
+      ignore_read_extra:
+        - 'runtime_config.0.properties'
+parameters:
+  - name: 'location'
+    type: String
+    description: |
+      The location in which the batch will be created.
+    url_param_only: true
+    immutable: true
+  - name: 'batchId'
+    type: String
+    description: |
+      The ID to use for the batch, which will become the final component of the batch's resource name.
+      This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/.
+    url_param_only: true
+    immutable: true
+properties:
+  - name: 'name'
+    type: String
+    description: |
+      The resource name of the batch.
+    output: true
+  - name: 'uuid'
+    type: String
+    description: |
+      A batch UUID (Unique Universal Identifier). The service generates this value when it creates the batch.
+    output: true
+  - name: 'createTime'
+    type: String
+    description: |
+      The time when the batch was created.
+    output: true
+  - name: 'runtimeInfo'
+    type: NestedObject
+    description: 'Runtime information about batch execution.'
+    output: true
+    properties:
+      - name: 'outputUri'
+        type: String
+        description: |
+          A URI pointing to the location of the stdout and stderr of the workload.
+        output: true
+      - name: 'diagnosticOutputUri'
+        type: String
+        description: |
+          A URI pointing to the location of the diagnostics tarball.
+        output: true
+      - name: 'endpoints'
+        type: KeyValuePairs
+        description: |
+          Map of remote access endpoints (such as web interfaces and APIs) to their URIs.
+        output: true
+      - name: 'approximateUsage'
+        type: NestedObject
+        description: |
+          Approximate workload resource usage, calculated when the workload completes (see [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+        output: true
+        properties:
+          - name: 'milliDcuSeconds'
+            type: String
+            description: |
+              DCU (Dataproc Compute Units) usage in (milliDCU x seconds).
+            output: true
+          - name: 'shuffleStorageGbSeconds'
+            type: String
+            description: |
+              Shuffle storage usage in (GB x seconds).
+            output: true
+          - name: 'milliAcceleratorSeconds'
+            type: String
+            description: |
+              Accelerator usage in (milliAccelerator x seconds).
+            output: true
+          - name: 'acceleratorType'
+            type: String
+            description: |
+              Accelerator type being used, if any.
+            output: true
+      - name: 'currentUsage'
+        type: NestedObject
+        description: |
+          Snapshot of current workload resource usage (see [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+        output: true
+        properties:
+          - name: 'milliDcu'
+            type: String
+            description: |
+              Milli (one-thousandth) Dataproc Compute Units (DCUs).
+            output: true
+          - name: 'shuffleStorageGb'
+            type: String
+            description: |
+              Shuffle Storage in gigabytes (GB).
+            output: true
+          - name: 'milliDcuPremium'
+            type: String
+            description: |
+              Milli (one-thousandth) Dataproc Compute Units (DCUs) charged at premium tier.
+            output: true
+          - name: 'shuffleStorageGbPremium'
+            type: String
+            description: |
+              Shuffle Storage in gigabytes (GB) charged at premium tier.
+            output: true
+          - name: 'milliAccelerator'
+            type: String
+            description: |
+              Milli (one-thousandth) accelerator.
+            output: true
+          - name: 'acceleratorType'
+            type: String
+            description: |
+              Accelerator type being used, if any.
+            output: true
+          - name: 'snapshotTime'
+            type: String
+            description: |
+              The timestamp of the usage snapshot.
+            output: true
+  - name: 'state'
+    type: String
+    description: |
+      The state of the batch. For possible values, see the [API documentation](https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#State).
+    output: true
+  - name: 'stateMessage'
+    type: String
+    description: |
+      Batch state details, such as a failure description if the state is FAILED.
+    output: true
+  - name: 'stateTime'
+    type: String
+    description: |
+      The time when the batch entered its current state.
+    output: true
+  - name: 'creator'
+    type: String
+    description: |
+      The email address of the user who created the batch.
+    output: true
+  - name: 'labels'
+    type: KeyValueLabels
+    description: |
+      The labels to associate with this batch.
+  - name: 'runtimeConfig'
+    type: NestedObject
+    description: |
+      Runtime configuration for the batch execution.
+    properties:
+      - name: 'version'
+        type: String
+        description: |
+          Version of the batch runtime.
+        default_from_api: true
+        diff_suppress_func: 'CloudDataprocBatchRuntimeConfigVersionDiffSuppress'
+      - name: 'containerImage'
+        type: String
+        description: |
+          Optional custom container image for the job runtime environment. If not specified, a default container image will be used.
+      - name: 'properties'
+        type: KeyValuePairs
+        description: |
+          A mapping of property names to values, which are used to configure workload execution.
+      - name: 'effective_properties'
+        type: KeyValuePairs
+        description: |
+          A mapping of property names to values, including those set by the server, which are used to configure workload execution.
+        output: true
+  - name: 'environmentConfig'
+    type: NestedObject
+    description: |
+      Environment configuration for the batch execution.
+    properties:
+      - name: 'executionConfig'
+        type: NestedObject
+        description: |
+          Execution configuration for a workload.
+        properties:
+          - name: 'serviceAccount'
+            type: String
+            description: |
+              Service account used to execute the workload.
+            default_from_api: true
+          - name: 'networkTags'
+            type: Array
+            description: |
+              Tags used for network traffic control.
+            item_type:
+              type: String
+          - name: 'kmsKey'
+            type: String
+            description: |
+              The Cloud KMS key to use for encryption.
+          - name: 'ttl'
+            type: String
+            description: |
+              The duration after which the workload will be terminated.
+              When the workload exceeds this duration, it will be unconditionally terminated without waiting for ongoing
+              work to finish. If ttl is not specified for a batch workload, the workload will be allowed to run until it
+              exits naturally (or run forever without exiting). If ttl is not specified for an interactive session,
+              it defaults to 24 hours. If ttl is not specified for a batch that uses 2.1+ runtime version, it defaults to 4 hours.
+              Minimum value is 10 minutes; maximum value is 14 days. If both ttl and idleTtl are specified (for an interactive session),
+              the conditions are treated as OR conditions: the workload will be terminated when it has been idle for idleTtl or
+              when ttl has been exceeded, whichever occurs first.
+            default_from_api: true
+          - name: 'stagingBucket'
+            type: String
+            description: |
+              A Cloud Storage bucket used to stage workload dependencies, config files, and store
+              workload output and other ephemeral data, such as Spark history files. If you do not specify a staging bucket,
+              Cloud Dataproc will determine a Cloud Storage location according to the region where your workload is running,
+              and then create and manage project-level, per-location staging and temporary buckets.
+ This field requires a Cloud Storage bucket name, not a gs://... URI to a Cloud Storage bucket. + - name: 'networkUri' + type: String + description: | + Network configuration for workload execution. + conflicts: + - environment_config.0.execution_config.0.subnetwork_uri + - name: 'subnetworkUri' + type: String + description: | + Subnetwork configuration for workload execution. + conflicts: + - environment_config.0.execution_config.0.network_uri + - name: 'peripheralsConfig' + type: NestedObject + description: | + Peripherals configuration that workload has access to. + default_from_api: true + allow_empty_object: true + properties: + - name: 'metastoreService' + type: String + description: | + Resource name of an existing Dataproc Metastore service. + - name: 'sparkHistoryServerConfig' + type: NestedObject + description: | + The Spark History Server configuration for the workload. + properties: + - name: 'dataprocCluster' + type: String + description: | + Resource name of an existing Dataproc Cluster to act as a Spark History Server for the workload. + - name: 'operation' + type: String + description: | + The resource name of the operation associated with this batch. + output: true + - name: 'stateHistory' + type: Array + description: | + Historical state information for the batch. + output: true + item_type: + type: NestedObject + properties: + - name: 'state' + type: String + description: | + The state of the batch at this point in history. For possible values, see the [API documentation](https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#State). + output: true + - name: 'stateMessage' + type: String + description: | + Details about the state at this point in history. + output: true + - name: 'stateStartTime' + type: String + description: | + The time when the batch entered the historical state. + output: true + - name: 'pysparkBatch' + type: NestedObject + description: | + PySpark batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'mainPythonFileUri' + type: String + description: | + The HCFS URI of the main Python file to use as the Spark driver. Must be a .py file. + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'pythonFileUris' + type: Array + description: | + HCFS file URIs of Python files to pass to the PySpark framework. + Supported file types: .py, .egg, and .zip. + item_type: + type: String + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to add to the classpath of the Spark driver and tasks. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'sparkBatch' + type: NestedObject + description: | + Spark batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. 
Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to add to the classpath of the Spark driver and tasks. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'mainJarFileUri' + type: String + description: | + The HCFS URI of the jar file that contains the main class. + exactly_one_of: + - 'spark_batch.0.main_class' + - name: 'mainClass' + type: String + description: | + The name of the driver main class. The jar file that contains the class must be in the + classpath or specified in jarFileUris. + exactly_one_of: + - 'spark_batch.0.main_jar_file_uri' + - name: 'sparkRBatch' + type: NestedObject + description: | + SparkR batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'mainRFileUri' + type: String + description: | + The HCFS URI of the main R file to use as the driver. Must be a .R or .r file. + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'sparkSqlBatch' + type: NestedObject + description: | + Spark SQL batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'queryFileUri' + type: String + description: | + The HCFS URI of the script that contains Spark SQL queries to execute. + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to be added to the Spark CLASSPATH. + item_type: + type: String + - name: 'queryVariables' + type: KeyValuePairs + description: | + Mapping of query variable names to values (equivalent to the Spark SQL command: SET name="value";). 
diff --git a/mmv1/products/dataproc/product.yaml b/mmv1/products/dataproc/product.yaml
index dc18983cd0db..d471a9673cc0 100644
--- a/mmv1/products/dataproc/product.yaml
+++ b/mmv1/products/dataproc/product.yaml
@@ -17,7 +17,5 @@ display_name: 'Dataproc'
 versions:
   - name: 'ga'
     base_url: 'https://dataproc.googleapis.com/v1/'
-  - name: 'beta'
-    base_url: 'https://dataproc.googleapis.com/v1beta2/'
 scopes:
   - 'https://www.googleapis.com/auth/cloud-identity'
diff --git a/mmv1/templates/terraform/constants/cloud_dataproc_batch.go.tmpl b/mmv1/templates/terraform/constants/cloud_dataproc_batch.go.tmpl
new file mode 100644
index 000000000000..81374685b694
--- /dev/null
+++ b/mmv1/templates/terraform/constants/cloud_dataproc_batch.go.tmpl
@@ -0,0 +1,15 @@
+/*
+ * The Dataproc Batch API appends a subminor version to the provided
+ * version. Suppress the diff on this server-generated subminor version.
+ */
+func CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(old, new string) bool {
+	if old != "" && strings.HasPrefix(new, old) || (new != "" && strings.HasPrefix(old, new)) {
+		return true
+	}
+
+	return old == new
+}
+
+func CloudDataprocBatchRuntimeConfigVersionDiffSuppress(_, old, new string, d *schema.ResourceData) bool {
+	return CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(old, new)
+}
\ No newline at end of file
diff --git a/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl b/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl
new file mode 100644
index 000000000000..e256d59ffd7e
--- /dev/null
+++ b/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl
@@ -0,0 +1,37 @@
+{{/*
+	The license inside this block applies to this file
+	Copyright 2024 Google Inc.
+	Licensed under the Apache License, Version 2.0 (the "License");
+	you may not use this file except in compliance with the License.
+	You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+*/ -}} +if obj1, ok := res["runtimeConfig"]; ok { + if rconfig, ok := obj1.(map[string]interface{}); ok { + if obj2, ok := rconfig["properties"]; ok { + if properties, ok := obj2.(map[string]interface{}); ok { + // Update effective_properties to include both server set and client set properties + propertiesCopy := make(map[string]interface{}) + for k, v := range properties { + propertiesCopy[k] = v + } + rconfig["effectiveProperties"] = propertiesCopy + + // Update properties back to original client set properties + originalPropertiesCopy := make(map[string]interface{}) + originalProperties := d.Get("runtime_config.0.properties").(interface{}).(map[string]interface{}) + for k, v := range originalProperties { + originalPropertiesCopy[k] = v + } + rconfig["properties"] = originalPropertiesCopy + return res, nil + } + } + } +} + +return res, nil diff --git a/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl new file mode 100644 index 000000000000..6dece108007d --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl @@ -0,0 +1,28 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + } + } + + pyspark_batch { + main_python_file_uri = "https://storage.googleapis.com/terraform-batches/test_util.py" + args = ["10"] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + python_file_uris = ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"] + archive_uris = [ + "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked", + "https://storage.googleapis.com/terraform-batches/animals.txt.jar", + "https://storage.googleapis.com/terraform-batches/animals.txt" + ] + file_uris = ["https://storage.googleapis.com/terraform-batches/people.txt"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl new file mode 100644 index 000000000000..defea3e807bf --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl @@ -0,0 +1,25 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + ttl = "3600s" + network_tags = ["tag1"] + } + } + + spark_batch { + main_class = "org.apache.spark.examples.SparkPi" + args = ["10"] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl new file mode 100644 index 000000000000..da010d11f76a --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl @@ -0,0 +1,114 @@ +data "google_project" "project" { +} + +data "google_storage_project_service_account" "gcs_account" { +} + +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + batch_id = "{{index 
$.Vars "dataproc_batch"}}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + version = "2.2" + } + + environment_config { + execution_config { + ttl = "3600s" + network_tags = ["tag1"] + kms_key = google_kms_crypto_key.crypto_key.id + network_uri = "default" + service_account = "${data.google_project.project.number}-compute@developer.gserviceaccount.com" + staging_bucket = google_storage_bucket.bucket.name + } + peripherals_config { + metastore_service = google_dataproc_metastore_service.ms.name + spark_history_server_config { + dataproc_cluster = google_dataproc_cluster.basic.id + } + } + } + + spark_batch { + main_class = "org.apache.spark.examples.SparkPi" + args = ["10"] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + } + + depends_on = [ + google_kms_crypto_key_iam_member.crypto_key_member_1, + ] +} + +resource "google_storage_bucket" "bucket" { + uniform_bucket_level_access = true + name = "{{index $.Vars "bucket_name"}}" + location = "US" + force_destroy = true +} + +resource "google_kms_crypto_key" "crypto_key" { + name = "{{index $.Vars "key_name"}}" + key_ring = google_kms_key_ring.key_ring.id + purpose = "ENCRYPT_DECRYPT" +} + +resource "google_kms_key_ring" "key_ring" { + name = "{{index $.Vars "keyring_name"}}" + location = "us-central1" +} + +resource "google_kms_crypto_key_iam_member" "crypto_key_member_1" { + crypto_key_id = google_kms_crypto_key.crypto_key.id + role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" + member = "serviceAccount:service-${data.google_project.project.number}@dataproc-accounts.iam.gserviceaccount.com" +} + +resource "google_dataproc_cluster" "basic" { + name = "{{index $.Vars "dataproc_batch"}}" + region = "us-central1" + + cluster_config { + # Keep the costs down with smallest config we can get away with + software_config { + override_properties = { + "dataproc:dataproc.allow.zero.workers" = "true" + "spark:spark.history.fs.logDirectory" = "gs://${google_storage_bucket.bucket.name}/*/spark-job-history" + } + } + + endpoint_config { + enable_http_port_access = true + } + + master_config { + num_instances = 1 + machine_type = "e2-standard-2" + disk_config { + boot_disk_size_gb = 35 + } + } + + metastore_config { + dataproc_metastore_service = google_dataproc_metastore_service.ms.name + } + } +} + + resource "google_dataproc_metastore_service" "ms" { + service_id = "{{index $.Vars "dataproc_batch"}}" + location = "us-central1" + port = 9080 + tier = "DEVELOPER" + + maintenance_window { + hour_of_day = 2 + day_of_week = "SUNDAY" + } + + hive_metastore_config { + version = "3.1.2" + } +} \ No newline at end of file diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl new file mode 100644 index 000000000000..2234d417e9c5 --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl @@ -0,0 +1,24 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + ttl = "3600s" + network_tags = ["tag1"] + } + } + + spark_r_batch { + main_r_file_uri = 
"https://storage.googleapis.com/terraform-batches/spark-r-flights.r" + args = ["https://storage.googleapis.com/terraform-batches/flights.csv"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl new file mode 100644 index 000000000000..e821a22377bf --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl @@ -0,0 +1,24 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + } + } + + spark_sql_batch { + query_file_uri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql" + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + query_variables = { + name = "value" + } + } +} + diff --git a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go index 17b15a699a63..1c5d4cc04519 100644 --- a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go +++ b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "maps" "os" "strings" "testing" @@ -910,7 +911,25 @@ func BootstrapSharedCaPoolInLocation(t *testing.T, location string) string { return poolName } +func BootstrapSubnetForDataprocBatches(t *testing.T, subnetName string, networkName string) string { + subnetOptions := map[string]interface{}{ + "privateIpGoogleAccess": true, + } + return BootstrapSubnetWithOverrides(t, subnetName, networkName, subnetOptions) +} + func BootstrapSubnet(t *testing.T, subnetName string, networkName string) string { + return BootstrapSubnetWithOverrides(t, subnetName, networkName, make(map[string]interface{})) +} + +func BootstrapSubnetWithFirewallForDataprocBatches(t *testing.T, testId string, subnetName string) string { + networkName := BootstrapSharedTestNetwork(t, testId) + subnetworkName := BootstrapSubnetForDataprocBatches(t, subnetName, networkName) + BootstrapFirewallForDataprocSharedNetwork(t, subnetName, networkName) + return subnetworkName +} + +func BootstrapSubnetWithOverrides(t *testing.T, subnetName string, networkName string, subnetOptions map[string]interface{}) string { projectID := envvar.GetTestProjectFromEnv() region := envvar.GetTestRegionFromEnv() @@ -932,20 +951,24 @@ func BootstrapSubnet(t *testing.T, subnetName string, networkName string) string networkUrl := fmt.Sprintf("%sprojects/%s/global/networks/%s", config.ComputeBasePath, projectID, networkName) url := fmt.Sprintf("%sprojects/%s/regions/%s/subnetworks", config.ComputeBasePath, projectID, region) - subnetObj := map[string]interface{}{ + defaultSubnetObj := map[string]interface{}{ "name": subnetName, "region ": region, "network": networkUrl, "ipCidrRange": "10.77.0.0/20", } + if len(subnetOptions) != 0 { + maps.Copy(defaultSubnetObj, subnetOptions) + } + res, err := transport_tpg.SendRequest(transport_tpg.SendRequestOptions{ Config: config, Method: "POST", Project: projectID, RawURL: url, UserAgent: config.UserAgent, - Body: subnetObj, + Body: defaultSubnetObj, Timeout: 4 * time.Minute, }) diff --git a/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go 
b/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go
new file mode 100644
index 000000000000..9d8e37f801c5
--- /dev/null
+++ b/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go
@@ -0,0 +1,59 @@
+package dataproc
+
+import (
+	"testing"
+)
+
+func TestCloudDataprocBatchRuntimeConfigVersionDiffSuppress(t *testing.T) {
+	cases := map[string]struct {
+		Old, New           string
+		ExpectDiffSuppress bool
+	}{
+		"old version is empty, new version has a value": {
+			Old:                "",
+			New:                "2.2.100",
+			ExpectDiffSuppress: false,
+		},
+		"old version is a prefix of the new version": {
+			Old:                "2.2",
+			New:                "2.2.100",
+			ExpectDiffSuppress: true,
+		},
+		"old version is not a prefix of the new version": {
+			Old:                "2.1",
+			New:                "2.2.100",
+			ExpectDiffSuppress: false,
+		},
+		"new version is empty, old version has a value": {
+			Old:                "2.2.100",
+			New:                "",
+			ExpectDiffSuppress: false,
+		},
+		"new version is a prefix of the old version": {
+			Old:                "2.2.100",
+			New:                "2.2",
+			ExpectDiffSuppress: true,
+		},
+		"new version is not a prefix of the old version": {
+			Old:                "2.2.100",
+			New:                "2.1",
+			ExpectDiffSuppress: false,
+		},
+		"old version is the same as the new version": {
+			Old:                "2.2.100",
+			New:                "2.2.100",
+			ExpectDiffSuppress: true,
+		},
+		"both the new and old versions are empty strings": {
+			Old:                "",
+			New:                "",
+			ExpectDiffSuppress: true,
+		},
+	}
+
+	for tn, tc := range cases {
+		if CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(tc.Old, tc.New) != tc.ExpectDiffSuppress {
+			t.Errorf("bad: %s, %q => %q, expected DiffSuppress to return %t", tn, tc.Old, tc.New, tc.ExpectDiffSuppress)
+		}
+	}
+}