diff --git a/mmv1/products/dataproc/Batch.yaml b/mmv1/products/dataproc/Batch.yaml
index b6835db7cb9e..7ee3bca75e97 100644
--- a/mmv1/products/dataproc/Batch.yaml
+++ b/mmv1/products/dataproc/Batch.yaml
@@ -51,6 +51,24 @@ examples:
       prevent_destroy: 'false'
     ignore_read_extra:
       - 'runtime_config.0.properties'
+  - !ruby/object:Provider::Terraform::Examples
+    name: 'dataproc_batch_spark_full'
+    primary_resource_id: 'example_batch_spark'
+    primary_resource_name:
+      'fmt.Sprintf("tf-test-spark-batch%s", context["random_suffix"])'
+    test_env_vars:
+      project_name: :PROJECT_NAME
+    vars:
+      network_name: 'default'
+      prevent_destroy: 'true'
+      key_name: 'example-key'
+      keyring_name: 'example-keyring'
+      bucket_name: 'dataproc-bucket'
+    test_vars_overrides:
+      network_name: 'acctest.BootstrapNetWithFirewallForDataprocBatches(t, "dataproc-spark-test-network", "dataproc-spark-test-subnetwork")'
+      prevent_destroy: 'false'
+    ignore_read_extra:
+      - 'runtime_config.0.properties'
   - !ruby/object:Provider::Terraform::Examples
     name: 'dataproc_batch_sparksql'
     primary_resource_id: 'example_batch_sparsql'
@@ -304,12 +322,26 @@ properties:
           The Cloud KMS key to use for encryption.
       - !ruby/object:Api::Type::String
         name: 'idleTtl'
+        default_from_api: true
         description: |
           Applies to sessions only. The duration to keep the session alive while it's idling.
+          Exceeding this threshold causes the session to terminate. This field cannot be set on a batch workload.
+          Minimum value is 10 minutes; maximum value is 14 days (see JSON representation of Duration).
+          Defaults to 1 hour if not set. If both ttl and idleTtl are specified for an interactive session,
+          the conditions are treated as OR conditions: the workload will be terminated when it has been idle
+          for idleTtl or when ttl has been exceeded, whichever occurs first.
      - !ruby/object:Api::Type::String
        name: 'ttl'
+        default_from_api: true
        description: |
-          The duration after which the workload will be terminated.
+          The duration after which the workload will be terminated, specified as the JSON representation for Duration.
+          When the workload exceeds this duration, it will be unconditionally terminated without waiting for ongoing
+          work to finish. If ttl is not specified for a batch workload, the workload will be allowed to run until it
+          exits naturally (or run forever without exiting). If ttl is not specified for an interactive session,
+          it defaults to 24 hours. If ttl is not specified for a batch that uses a 2.1+ runtime version, it defaults to 4 hours.
+          Minimum value is 10 minutes; maximum value is 14 days. If both ttl and idleTtl are specified (for an interactive session),
+          the conditions are treated as OR conditions: the workload will be terminated when it has been idle for idleTtl or
+          when ttl has been exceeded, whichever occurs first.
       - !ruby/object:Api::Type::String
         name: 'stagingBucket'
         description: |
diff --git a/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.erb b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.erb
index 05143a7bae76..893ee0bcf54c 100644
--- a/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.erb
+++ b/mmv1/templates/terraform/examples/dataproc_batch_pyspark.tf.erb
@@ -15,6 +15,11 @@ resource "google_dataproc_batch" "<%= ctx[:primary_resource_id] %>" {
 
   pyspark_batch {
     main_python_file_uri = "gs://dataproc-examples/pyspark/hello-world/hello-world.py"
+    archive_uris         = ["archive-uri-1", "archive-uri-2"]
+    args                 = ["10"]
+    file_uris            = ["file-uri-1", "file-uri-2"]
+    jar_file_uris        = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
+    python_file_uris     = ["pipelineparam--common_utils_py_fqn"]
   }
 }
 
diff --git a/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.erb b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.erb
new file mode 100644
index 000000000000..62bae8ec5cdb
--- /dev/null
+++ b/mmv1/templates/terraform/examples/dataproc_batch_spark_full.tf.erb
@@ -0,0 +1,131 @@
+data "google_project" "test_project" {
+}
+
+data "google_storage_project_service_account" "gcs_account" {
+}
+
+resource "google_dataproc_batch" "<%= ctx[:primary_resource_id] %>" {
+  batch_id = "<%= ctx[:vars]['dataproc_batch'] %>"
+  location = "us-central1"
+  labels   = { "batch_test" : "terraform" }
+
+  runtime_config {
+    properties      = { "spark.dynamicAllocation.enabled" : "false", "spark.executor.instances" : "2" }
+    container_image = "gcr.io/my-project-id/my-spark-image:latest"
+    repository_config {
+      pypi_repository_config {
+        pypi_repository = "PYPI"
+      }
+    }
+    version = "2.2"
+  }
+
+  environment_config {
+    execution_config {
+      ttl             = "3600s"
+      idle_ttl        = "3600s"
+      network_tags    = ["tag1"]
+      kms_key         = google_kms_crypto_key.crypto_key.id
+      network_uri     = "<%= ctx[:vars]['network_name'] %>"
+      service_account = "${data.google_project.test_project.number}-compute@developer.gserviceaccount.com"
+      staging_bucket  = google_storage_bucket.bucket.name
+    }
+    peripherals_config {
+      metastore_service = google_dataproc_metastore_service.ms.name
+      spark_history_server_config {
+        dataproc_cluster = google_dataproc_cluster.basic.id
+      }
+    }
+  }
+
+  spark_batch {
+    main_class = "org.apache.spark.examples.SparkPi"
+    args = [
+      "wordcount",
+      "file:///usr/lib/spark/NOTICE",
+      "gs://${google_dataproc_cluster.basic.cluster_config[0].bucket}/hadoopjob_output_%s",
+    ]
+    jar_file_uris     = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
+    archive_uris      = ["archive-uri-1", "archive-uri-2"]
+    file_uris         = ["file-uri-1", "file-uri-2"]
+    main_jar_file_uri = "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar"
+  }
+
+  depends_on = [
+    google_kms_crypto_key_iam_member.crypto_key_member_1,
+    google_kms_crypto_key_iam_member.crypto_key_member_2,
+  ]
+}
+
+resource "google_storage_bucket" "bucket" {
+  name     = "<%= ctx[:vars]['bucket_name'] %>"
+  location = "US"
+}
+
+resource "google_kms_crypto_key" "crypto_key" {
+  name     = "<%= ctx[:vars]['key_name'] %>"
+  key_ring = google_kms_key_ring.key_ring.id
+
+  purpose = "ENCRYPT_DECRYPT"
+}
+
+resource "google_kms_key_ring" "key_ring" {
+  name     = "<%= ctx[:vars]['keyring_name'] %>"
+  location = "us-central1"
+}
+
+resource "google_kms_crypto_key_iam_member" "crypto_key_member_1" {
+  crypto_key_id = google_kms_crypto_key.crypto_key.id
+  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
+
+  member = "serviceAccount:service-${data.google_project.test_project.number}@gcp-sa-metastore.iam.gserviceaccount.com"
+}
+
+resource "google_kms_crypto_key_iam_member" "crypto_key_member_2" {
+  crypto_key_id = google_kms_crypto_key.crypto_key.id
+  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
+
+  member = "serviceAccount:${data.google_storage_project_service_account.gcs_account.email_address}"
+}
+
+resource "google_dataproc_cluster" "basic" {
+  name   = "<%= ctx[:vars]['dataproc_batch'] %>"
+  region = "us-central1"
+
+  cluster_config {
+    # Keep the costs down with smallest config we can get away with
+    software_config {
+      override_properties = {
+        "dataproc:dataproc.allow.zero.workers" = "true"
+      }
+    }
+
+    master_config {
+      num_instances = 1
+      machine_type  = "e2-standard-2"
+      disk_config {
+        boot_disk_size_gb = 35
+      }
+    }
+
+    metastore_config {
+      dataproc_metastore_service = google_dataproc_metastore_service.ms.name
+    }
+  }
+}
+
+resource "google_dataproc_metastore_service" "ms" {
+  service_id = "<%= ctx[:vars]['dataproc_batch'] %>"
+  location   = "us-central1"
+  port       = 9080
+  tier       = "DEVELOPER"
+
+  maintenance_window {
+    hour_of_day = 2
+    day_of_week = "SUNDAY"
+  }
+
+  hive_metastore_config {
+    version = "3.1.2"
+  }
+}
\ No newline at end of file
diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.erb b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.erb
index fc47d5092323..5ff9ee21e458 100644
--- a/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.erb
+++ b/mmv1/templates/terraform/examples/dataproc_batch_sparkr.tf.erb
@@ -19,6 +19,8 @@ resource "google_dataproc_batch" "<%= ctx[:primary_resource_id] %>" {
 
   spark_r_batch {
     main_r_file_uri = "https://storage.googleapis.com/terraform-batches/spark-r-flights.r"
     args            = ["https://storage.googleapis.com/terraform-batches/flights.csv"]
+    archive_uris    = ["archive-uri-1", "archive-uri-2"]
+    file_uris       = ["file-uri-1", "file-uri-2"]
   }
 }
diff --git a/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.erb b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.erb
index f6251af054e7..292396a5e7f7 100644
--- a/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.erb
+++ b/mmv1/templates/terraform/examples/dataproc_batch_sparksql.tf.erb
@@ -15,6 +15,7 @@ resource "google_dataproc_batch" "<%= ctx[:primary_resource_id] %>" {
 
   spark_sql_batch {
     query_file_uri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql"
+    jar_file_uris  = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
   }
 }
diff --git a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go
index f74b172e60b5..e2da1fc949de 100644
--- a/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go
+++ b/mmv1/third_party/terraform/acctest/bootstrap_test_utils.go
@@ -929,6 +929,15 @@ func BootstrapSubnetWithFirewallForDataprocBatches(t *testing.T, testId string,
 	return subnetworkName
 }
 
+// BootstrapNetWithFirewallForDataprocBatches bootstraps a shared test network
+// with the firewall rule required by Dataproc batch workloads and returns the
+// network name.
+func BootstrapNetWithFirewallForDataprocBatches(t *testing.T, testId string, subnetName string) string {
+	networkName := BootstrapSharedTestNetwork(t, testId)
+	BootstrapFirewallForDataprocSharedNetwork(t, subnetName, networkName)
+	return networkName
+}
+
 func BootstrapSubnetWithOverrides(t *testing.T, subnetName string, networkName string, subnetOptions map[string]interface{}) string {
 	projectID := envvar.GetTestProjectFromEnv()
 	region := envvar.GetTestRegionFromEnv()