From 6672fee8c94fa93b7220aaeed4424da985795c0a Mon Sep 17 00:00:00 2001 From: Zhenhua Li Date: Wed, 25 Sep 2024 15:40:51 -0700 Subject: [PATCH] Convert from Ruby to Go Fix tests Address comments Rename resource_dataproc_batch_test.go to resource_dataproc_batch_test.go.erb Fixed package name & switched to func Fixed imports & test file name Fixed function names Fix PR review comments Fixed yaml lint errors Add a test for SparkR batch Resolve version and properties permadiff for batch resource Add more fields to test examples Resolve peripheral_configs permadiff. Making fields exatly_one_off/ Bootstrap new subnet with firewall for Dataproc Batches Added utility method to bootstrap subnet for Dataproc batches Added utility method to bootstrap subnet for Dataproc batches Updated async block for batch resource. Terraform Support for Dataproc Batches. --- mmv1/products/dataproc/Batch.yaml | 540 ++++++++++++++++++ mmv1/products/dataproc/product.yaml | 3 +- .../constants/cloud_dataproc_batch.go | 15 + .../decoders/cloud_dataproc_batch.go.tmpl | 37 ++ .../examples/dataproc_batch_pyspark.tf.tmpl | 28 + .../examples/dataproc_batch_spark.tf.tmpl | 25 + .../dataproc_batch_spark_full.tf.tmpl | 127 ++++ .../examples/dataproc_batch_sparkr.tf.tmpl | 24 + .../examples/dataproc_batch_sparksql.tf.tmpl | 24 + .../terraform/acctest/bootstrap_test_utils.go | 33 +- .../dataproc/resource_dataproc_batch_test.go | 59 ++ 11 files changed, 911 insertions(+), 4 deletions(-) create mode 100644 mmv1/products/dataproc/Batch.yaml create mode 100644 mmv1/templates/terraform/constants/cloud_dataproc_batch.go create mode 100644 mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl create mode 100644 mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl create mode 100644 mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go diff --git a/mmv1/products/dataproc/Batch.yaml b/mmv1/products/dataproc/Batch.yaml new file mode 100644 index 000000000000..d994225a2c47 --- /dev/null +++ b/mmv1/products/dataproc/Batch.yaml @@ -0,0 +1,540 @@ +# Copyright 2024 Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +name: 'Batch' +description: | + Dataproc Serverless Batches lets you run Spark workloads without requiring you to + provision and manage your own Dataproc cluster. +references: + guides: + 'Dataproc Serverless Batches Intro': 'https://cloud.google.com/dataproc-serverless/docs/overview' + api: 'https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches' +docs: +id_format: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +base_url: 'projects/{{project}}/locations/{{location}}/batches' +self_link: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +create_url: 'projects/{{project}}/locations/{{location}}/batches?batchId={{batch_id}}' +delete_url: 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +immutable: true +import_format: + - 'projects/{{project}}/locations/{{location}}/batches/{{batch_id}}' +timeouts: + insert_minutes: 10 + update_minutes: 20 + delete_minutes: 5 +autogen_async: true +async: + actions: ['create', 'delete', 'update'] + type: 'OpAsync' + operation: + base_url: '{{op_id}}' + result: + resource_inside_response: false +collection_url_key: 'batches' +custom_code: + constants: 'templates/terraform/constants/cloud_dataproc_batch.go.tmpl' + decoder: 'templates/terraform/decoders/cloud_dataproc_batch.go.tmpl' +examples: + - name: 'dataproc_batch_spark' + primary_resource_id: 'example_batch_spark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-spark-test-network", "dataproc-spark-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_spark_full' + primary_resource_id: 'example_batch_spark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + dataproc_batch: 'dataproc-batch' + network_name: 'default' + prevent_destroy: 'true' + key_name: 'example-key' + keyring_name: 'example-keyring' + bucket_name: 'dataproc-bucket' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_sparksql' + primary_resource_id: 'example_batch_sparsql' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-sparksql-test-network", "dataproc-sparksql-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_pyspark' + primary_resource_id: 'example_batch_pyspark' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-pyspark-test-network", "dataproc-pyspark-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' + - name: 'dataproc_batch_sparkr' + primary_resource_id: 'example_batch_sparkr' + primary_resource_name: 'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])' + vars: + subnetwork_name: 'default' + prevent_destroy: 'true' + test_env_vars: + project_name: 'PROJECT_NAME' + test_vars_overrides: + 'subnetwork_name': 'acctest.BootstrapSubnetWithFirewallForDataprocBatches(t, "dataproc-pyspark-test-network", "dataproc-pyspark-test-subnetwork")' + 'prevent_destroy': 'false' + ignore_read_extra: + - 'runtime_config.0.properties' +parameters: + - name: 'location' + type: String + description: | + The location in which the batch will be created in. + url_param_only: true + immutable: true + - name: 'batchId' + type: String + description: | + The ID to use for the batch, which will become the final component of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + url_param_only: true + immutable: true +properties: + - name: 'name' + type: String + description: | + The resource name of the batch. + output: true + - name: 'uuid' + type: String + description: | + A batch UUID (Unique Universal Identifier). The service generates this value when it creates the batch. + output: true + - name: 'createTime' + type: String + description: | + The time when the batch was created. + output: true + - name: 'runtimeInfo' + type: NestedObject + description: 'Runtime information about batch execution.' + output: true + properties: + - name: 'outputUri' + type: String + description: | + A URI pointing to the location of the stdout and stderr of the workload. + output: true + - name: 'diagnosticOutputUri' + type: String + description: | + A URI pointing to the location of the diagnostics tarball. + output: true + - name: 'endpoints' + type: KeyValuePairs + description: | + Map of remote access endpoints (such as web interfaces and APIs) to their URIs. + output: true + - name: 'approximateUsage' + type: NestedObject + description: | + Approximate workload resource usage, calculated when the workload completes(see [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)) + output: true + properties: + - name: 'milliDcuSeconds' + type: String + description: | + DCU (Dataproc Compute Units) usage in (milliDCU x seconds) + output: true + - name: 'shuffleStorageGbSeconds' + type: String + description: | + Shuffle storage usage in (GB x seconds) + output: true + - name: 'milliAcceleratorSeconds' + type: String + description: | + Accelerator usage in (milliAccelerator x seconds) + output: true + - name: 'acceleratorType' + type: String + description: | + Accelerator type being used, if any + output: true + - name: 'currentUsage' + type: NestedObject + description: | + Snapshot of current workload resource usage(see [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)) + output: true + properties: + - name: 'milliDcu' + type: String + description: | + Milli (one-thousandth) Dataproc Compute Units (DCUs). + output: true + - name: 'shuffleStorageGb' + type: String + description: | + Shuffle Storage in gigabytes (GB). + output: true + - name: 'milliDcuPremium' + type: String + description: | + Milli (one-thousandth) Dataproc Compute Units (DCUs) charged at premium tier. + output: true + - name: 'shuffleStorageGbPremium' + type: String + description: | + Shuffle Storage in gigabytes (GB) charged at premium tier. + output: true + - name: 'milliAccelerator' + type: String + description: | + Milli (one-thousandth) accelerator.. + output: true + - name: 'acceleratorType' + type: String + description: | + Accelerator type being used, if any. + output: true + - name: 'snapshotTime' + type: String + description: | + The timestamp of the usage snapshot. + output: true + - name: 'state' + type: String + description: | + The state of the batch. For possible values, see the [API documentation](https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#State). + output: true + - name: 'stateMessage' + type: String + description: | + Batch state details, such as a failure description if the state is FAILED. + output: true + - name: 'stateTime' + type: String + description: | + Batch state details, such as a failure description if the state is FAILED. + output: true + - name: 'creator' + type: String + description: | + The email address of the user who created the batch. + output: true + - name: 'labels' + type: KeyValueLabels + description: | + The labels to associate with this batch. + - name: 'runtimeConfig' + type: NestedObject + description: | + Runtime configuration for the batch execution. + properties: + - name: 'version' + type: String + description: | + Version of the batch runtime. + default_from_api: true + diff_suppress_func: 'CloudDataprocBatchRuntimeConfigVersionDiffSuppress' + - name: 'containerImage' + type: String + description: | + Optional custom container image for the job runtime environment. If not specified, a default container image will be used. + - name: 'properties' + type: KeyValuePairs + description: | + A mapping of property names to values, which are used to configure workload execution. + - name: 'effective_properties' + type: KeyValuePairs + description: | + A mapping of property names to values, which are used to configure workload execution. + output: true + - name: 'environmentConfig' + type: NestedObject + description: | + Environment configuration for the batch execution. + properties: + - name: 'executionConfig' + type: NestedObject + description: | + Execution configuration for a workload. + properties: + - name: 'serviceAccount' + type: String + description: | + Service account that used to execute workload. + default_from_api: true + - name: 'networkTags' + type: Array + description: | + Tags used for network traffic control. + item_type: + type: String + - name: 'kmsKey' + type: String + description: | + The Cloud KMS key to use for encryption. + - name: 'ttl' + type: String + description: | + The duration after which the workload will be terminated. + When the workload exceeds this duration, it will be unconditionally terminated without waiting for ongoing + work to finish. If ttl is not specified for a batch workload, the workload will be allowed to run until it + exits naturally (or run forever without exiting). If ttl is not specified for an interactive session, + it defaults to 24 hours. If ttl is not specified for a batch that uses 2.1+ runtime version, it defaults to 4 hours. + Minimum value is 10 minutes; maximum value is 14 days. If both ttl and idleTtl are specified (for an interactive session), + the conditions are treated as OR conditions: the workload will be terminated when it has been idle for idleTtl or + when ttl has been exceeded, whichever occurs first. + default_from_api: true + - name: 'stagingBucket' + type: String + description: | + A Cloud Storage bucket used to stage workload dependencies, config files, and store + workload output and other ephemeral data, such as Spark history files. If you do not specify a staging bucket, + Cloud Dataproc will determine a Cloud Storage location according to the region where your workload is running, + and then create and manage project-level, per-location staging and temporary buckets. + This field requires a Cloud Storage bucket name, not a gs://... URI to a Cloud Storage bucket. + - name: 'networkUri' + type: String + description: | + Network configuration for workload execution. + conflicts: + - environment_config.0.execution_config.0.subnetwork_uri + - name: 'subnetworkUri' + type: String + description: | + Subnetwork configuration for workload execution. + conflicts: + - environment_config.0.execution_config.0.network_uri + - name: 'peripheralsConfig' + type: NestedObject + description: | + Peripherals configuration that workload has access to. + default_from_api: true + allow_empty_object: true + properties: + - name: 'metastoreService' + type: String + description: | + Resource name of an existing Dataproc Metastore service. + - name: 'sparkHistoryServerConfig' + type: NestedObject + description: | + The Spark History Server configuration for the workload. + properties: + - name: 'dataprocCluster' + type: String + description: | + Resource name of an existing Dataproc Cluster to act as a Spark History Server for the workload. + - name: 'operation' + type: String + description: | + The resource name of the operation associated with this batch. + output: true + - name: 'stateHistory' + type: Array + description: | + Historical state information for the batch. + output: true + item_type: + type: NestedObject + properties: + - name: 'state' + type: String + description: | + The state of the batch at this point in history. For possible values, see the [API documentation](https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#State). + output: true + - name: 'stateMessage' + type: String + description: | + Details about the state at this point in history. + output: true + - name: 'stateStartTime' + type: String + description: | + The time when the batch entered the historical state. + output: true + - name: 'pysparkBatch' + type: NestedObject + description: | + PySpark batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'mainPythonFileUri' + type: String + description: | + The HCFS URI of the main Python file to use as the Spark driver. Must be a .py file. + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'pythonFileUris' + type: Array + description: | + HCFS file URIs of Python files to pass to the PySpark framework. + Supported file types: .py, .egg, and .zip. + item_type: + type: String + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to add to the classpath of the Spark driver and tasks. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'sparkBatch' + type: NestedObject + description: | + Spark batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to add to the classpath of the Spark driver and tasks. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'mainJarFileUri' + type: String + description: | + The HCFS URI of the jar file that contains the main class. + exactly_one_of: + - 'spark_batch.0.main_class' + - name: 'mainClass' + type: String + description: | + The name of the driver main class. The jar file that contains the class must be in the + classpath or specified in jarFileUris. + exactly_one_of: + - 'spark_batch.0.main_jar_file_uri' + - name: 'sparkRBatch' + type: NestedObject + description: | + SparkR batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'mainRFileUri' + type: String + description: | + The HCFS URI of the main R file to use as the driver. Must be a .R or .r file. + - name: 'args' + type: Array + description: | + The arguments to pass to the driver. Do not include arguments that can be set as batch + properties, such as --conf, since a collision can occur that causes an incorrect batch submission. + item_type: + type: String + - name: 'fileUris' + type: Array + description: | + HCFS URIs of files to be placed in the working directory of each executor. + item_type: + type: String + - name: 'archiveUris' + type: Array + description: | + HCFS URIs of archives to be extracted into the working directory of each executor. + Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip. + item_type: + type: String + - name: 'sparkSqlBatch' + type: NestedObject + description: | + Spark SQL batch config. + exactly_one_of: + - 'pyspark_batch' + - 'spark_batch' + - 'spark_sql_batch' + - 'spark_r_batch' + properties: + - name: 'queryFileUri' + type: String + description: | + The HCFS URI of the script that contains Spark SQL queries to execute. + - name: 'jarFileUris' + type: Array + description: | + HCFS URIs of jar files to be added to the Spark CLASSPATH. + item_type: + type: String + - name: 'queryVariables' + type: KeyValuePairs + description: | + Mapping of query variable names to values (equivalent to the Spark SQL command: SET name="value";). diff --git a/mmv1/products/dataproc/product.yaml b/mmv1/products/dataproc/product.yaml index dc18983cd0db..d6cd5271db18 100644 --- a/mmv1/products/dataproc/product.yaml +++ b/mmv1/products/dataproc/product.yaml @@ -11,13 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Warning: This is a temporary file, and should not be edited directly --- name: 'Dataproc' display_name: 'Dataproc' versions: - name: 'ga' base_url: 'https://dataproc.googleapis.com/v1/' - - name: 'beta' - base_url: 'https://dataproc.googleapis.com/v1beta2/' scopes: - 'https://www.googleapis.com/auth/cloud-identity' diff --git a/mmv1/templates/terraform/constants/cloud_dataproc_batch.go b/mmv1/templates/terraform/constants/cloud_dataproc_batch.go new file mode 100644 index 000000000000..bb6cff9d836f --- /dev/null +++ b/mmv1/templates/terraform/constants/cloud_dataproc_batch.go @@ -0,0 +1,15 @@ +/* + * Dataproc Batch api apends subminor version to the provided + * version. We are suppressing this server generated subminor. + */ +func CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(old, new string) bool { + if old != "" && strings.HasPrefix(new, old) || (new != "" && strings.HasPrefix(old, new)) { + return true + } + + return old == new +} + +func CloudDataprocBatchRuntimeConfigVersionDiffSuppress(_, old, new string, d *schema.ResourceData) bool { + return CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(old, new) +} diff --git a/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl b/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl new file mode 100644 index 000000000000..e256d59ffd7e --- /dev/null +++ b/mmv1/templates/terraform/decoders/cloud_dataproc_batch.go.tmpl @@ -0,0 +1,37 @@ +{{/* + The license inside this block applies to this file + Copyright 2024 Google Inc. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ -}} +if obj1, ok := res["runtimeConfig"]; ok { + if rconfig, ok := obj1.(map[string]interface{}); ok { + if obj2, ok := rconfig["properties"]; ok { + if properties, ok := obj2.(map[string]interface{}); ok { + // Update effective_properties to include both server set and client set properties + propertiesCopy := make(map[string]interface{}) + for k, v := range properties { + propertiesCopy[k] = v + } + rconfig["effectiveProperties"] = propertiesCopy + + // Update properties back to original client set properties + originalPropertiesCopy := make(map[string]interface{}) + originalProperties := d.Get("runtime_config.0.properties").(interface{}).(map[string]interface{}) + for k, v := range originalProperties { + originalPropertiesCopy[k] = v + } + rconfig["properties"] = originalPropertiesCopy + return res, nil + } + } + } +} + +return res, nil diff --git a/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl new file mode 100644 index 000000000000..6dece108007d --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.tmpl @@ -0,0 +1,28 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + } + } + + pyspark_batch { + main_python_file_uri = "https://storage.googleapis.com/terraform-batches/test_util.py" + args = ["10"] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + python_file_uris = ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"] + archive_uris = [ + "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked", + "https://storage.googleapis.com/terraform-batches/animals.txt.jar", + "https://storage.googleapis.com/terraform-batches/animals.txt" + ] + file_uris = ["https://storage.googleapis.com/terraform-batches/people.txt"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl new file mode 100644 index 000000000000..defea3e807bf --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_spark.tf.tmpl @@ -0,0 +1,25 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + ttl = "3600s" + network_tags = ["tag1"] + } + } + + spark_batch { + main_class = "org.apache.spark.examples.SparkPi" + args = ["10"] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl new file mode 100644 index 000000000000..cbc3bb182381 --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.tmpl @@ -0,0 +1,127 @@ +data "google_project" "project" { +} + +data "google_storage_project_service_account" "gcs_account" { +} + +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + batch_id = "{{index $.Vars "dataproc_batch"}}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + version = "2.2" + } + + environment_config { + execution_config { + ttl = "3600s" + network_tags = ["tag1"] + kms_key = google_kms_crypto_key.crypto_key.id + network_uri = "{{index $.Vars "network_name"}}" + service_account = "${data.google_project.project.number}-compute@developer.gserviceaccount.com" + staging_bucket = google_storage_bucket.bucket.name + } + peripherals_config { + metastore_service = google_dataproc_metastore_service.ms.name + spark_history_server_config { + dataproc_cluster = google_dataproc_cluster.basic.id + } + } + } + + spark_batch { + args = [ + "wordcount", + "file:///usr/lib/spark/NOTICE", + "gs://${google_dataproc_cluster.basic.cluster_config[0].bucket}/hadoopjob_output_%s", + ] + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + main_jar_file_uri = "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar" + } + + depends_on = [ + google_kms_crypto_key_iam_member.crypto_key_member_1, + google_kms_crypto_key_iam_member.crypto_key_member_2, + ] +} + +resource "google_storage_bucket" "bucket" { + uniform_bucket_level_access = true + name = "{{index $.Vars "bucket_name"}}" + location = "US" +} + +resource "google_kms_crypto_key" "crypto_key" { + name = "{{index $.Vars "key_name"}}" + key_ring = google_kms_key_ring.key_ring.id + + purpose = "ENCRYPT_DECRYPT" +} + +resource "google_kms_key_ring" "key_ring" { + name = "{{index $.Vars "keyring_name"}}" + location = "us-central1" +} + +resource "google_kms_crypto_key_iam_member" "crypto_key_member_1" { + crypto_key_id = google_kms_crypto_key.crypto_key.id + role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" + + member = "serviceAccount:service-${data.google_project.project.number}@dataproc-accounts.iam.gserviceaccount.com" +} + +resource "google_kms_crypto_key_iam_member" "crypto_key_member_2" { + crypto_key_id = google_kms_crypto_key.crypto_key.id + role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" + + member = "serviceAccount:${data.google_storage_project_service_account.gcs_account.email_address}" +} + +resource "google_dataproc_cluster" "basic" { + name = "{{index $.Vars "dataproc_batch"}}" + region = "us-central1" + + cluster_config { + # Keep the costs down with smallest config we can get away with + software_config { + override_properties = { + "dataproc:dataproc.allow.zero.workers" = "true" + "spark:spark.history.fs.logDirectory" = "gs://mybucket/spark-job-history" + } + } + + endpoint_config { + enable_http_port_access = true + } + + master_config { + num_instances = 1 + machine_type = "e2-standard-2" + disk_config { + boot_disk_size_gb = 35 + } + } + + metastore_config { + dataproc_metastore_service = google_dataproc_metastore_service.ms.name + } + } +} + + resource "google_dataproc_metastore_service" "ms" { + service_id = "{{index $.Vars "dataproc_batch"}}" + location = "us-central1" + port = 9080 + tier = "DEVELOPER" + + maintenance_window { + hour_of_day = 2 + day_of_week = "SUNDAY" + } + + hive_metastore_config { + version = "3.1.2" + } +} \ No newline at end of file diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl new file mode 100644 index 000000000000..2234d417e9c5 --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.tmpl @@ -0,0 +1,24 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + labels = {"batch_test": "terraform"} + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + ttl = "3600s" + network_tags = ["tag1"] + } + } + + spark_r_batch { + main_r_file_uri = "https://storage.googleapis.com/terraform-batches/spark-r-flights.r" + args = ["https://storage.googleapis.com/terraform-batches/flights.csv"] + } +} + diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl new file mode 100644 index 000000000000..e821a22377bf --- /dev/null +++ b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.tmpl @@ -0,0 +1,24 @@ +resource "google_dataproc_batch" "{{$.PrimaryResourceId}}" { + + batch_id = "tf-test-batch%{random_suffix}" + location = "us-central1" + + runtime_config { + properties = { "spark.dynamicAllocation.enabled": "false", "spark.executor.instances": "2" } + } + + environment_config { + execution_config { + subnetwork_uri = "{{index $.Vars "subnetwork_name"}}" + } + } + + spark_sql_batch { + query_file_uri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql" + jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"] + query_variables = { + name = "value" + } + } +} + diff --git a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go index 807dea324306..0010c4d0a3ec 100644 --- a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go +++ b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "maps" "os" "strings" "testing" @@ -910,7 +911,31 @@ func BootstrapSharedCaPoolInLocation(t *testing.T, location string) string { return poolName } +func BootstrapSubnetForDataprocBatches(t *testing.T, subnetName string, networkName string) string { + subnetOptions := map[string]interface{}{ + "privateIpGoogleAccess": true, + } + return BootstrapSubnetWithOverrides(t, subnetName, networkName, subnetOptions) +} + func BootstrapSubnet(t *testing.T, subnetName string, networkName string) string { + return BootstrapSubnetWithOverrides(t, subnetName, networkName, make(map[string]interface{})) +} + +func BootstrapSubnetWithFirewallForDataprocBatches(t *testing.T, testId string, subnetName string) string { + networkName := BootstrapSharedTestNetwork(t, testId) + subnetworkName := BootstrapSubnetForDataprocBatches(t, subnetName, networkName) + BootstrapFirewallForDataprocSharedNetwork(t, subnetName, networkName) + return subnetworkName +} + +func BootstrapNetWithFirewallForDataprocBatches(t *testing.T, testId string, subnetName string) string { + networkName := BootstrapSharedTestNetwork(t, testId) + BootstrapFirewallForDataprocSharedNetwork(t, subnetName, networkName) + return networkName +} + +func BootstrapSubnetWithOverrides(t *testing.T, subnetName string, networkName string, subnetOptions map[string]interface{}) string { projectID := envvar.GetTestProjectFromEnv() region := envvar.GetTestRegionFromEnv() @@ -932,20 +957,24 @@ func BootstrapSubnet(t *testing.T, subnetName string, networkName string) string networkUrl := fmt.Sprintf("%sprojects/%s/global/networks/%s", config.ComputeBasePath, projectID, networkName) url := fmt.Sprintf("%sprojects/%s/regions/%s/subnetworks", config.ComputeBasePath, projectID, region) - subnetObj := map[string]interface{}{ + defaultSubnetObj := map[string]interface{}{ "name": subnetName, "region ": region, "network": networkUrl, "ipCidrRange": "10.77.0.0/20", } + if len(subnetOptions) != 0 { + maps.Copy(defaultSubnetObj, subnetOptions) + } + res, err := transport_tpg.SendRequest(transport_tpg.SendRequestOptions{ Config: config, Method: "POST", Project: projectID, RawURL: url, UserAgent: config.UserAgent, - Body: subnetObj, + Body: defaultSubnetObj, Timeout: 4 * time.Minute, }) diff --git a/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go b/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go new file mode 100644 index 000000000000..9d8e37f801c5 --- /dev/null +++ b/mmv1/third_party/terraform/services/dataproc/resource_dataproc_batch_test.go @@ -0,0 +1,59 @@ +package dataproc + +import ( + "testing" +) + +func TestCloudDataprocBatchRuntimeConfigVersionDiffSuppress(t *testing.T) { + cases := map[string]struct { + Old, New string + ExpectDiffSuppress bool + }{ + "old version is empty, new version has a value": { + Old: "", + New: "2.2.100", + ExpectDiffSuppress: false, + }, + "old version is the prefix of the new version": { + Old: "2.2", + New: "2.2.100", + ExpectDiffSuppress: true, + }, + "old version is not the prefix of the new version": { + Old: "2.1", + New: "2.2.100", + ExpectDiffSuppress: false, + }, + "new version is empty, old version has a value": { + Old: "2.2.100", + New: "", + ExpectDiffSuppress: false, + }, + "new version is the prefix of the old version": { + Old: "2.2.100", + New: "2.2", + ExpectDiffSuppress: true, + }, + "new version is not the prefix of the old version": { + Old: "2.2.100", + New: "2.1", + ExpectDiffSuppress: false, + }, + "old version is the same with the new version": { + Old: "2.2.100", + New: "2.2.100", + ExpectDiffSuppress: true, + }, + "both new version and old version are empty string": { + Old: "", + New: "", + ExpectDiffSuppress: true, + }, + } + + for tn, tc := range cases { + if CloudDataprocBatchRuntimeConfigVersionDiffSuppressFunc(tc.Old, tc.New) != tc.ExpectDiffSuppress { + t.Errorf("bad: %s, %q => %q expect DiffSuppress to return %t", tn, tc.Old, tc.New, tc.ExpectDiffSuppress) + } + } +}