Showing 2 changed files with 242 additions and 0 deletions.

SubmitHadoopFSJob.java (new file)
/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// [START dataproc_submit_hadoop_fs_job]
/* This sample submits a Hadoop FS job to an existing Cloud Dataproc cluster,
 * waits for the job to complete, and reads the job's driver output, all
 * using the Java client library.
 *
 * Usage:
 *   mvn clean package -DskipTests
 *
 *   mvn exec:java -Dexec.args="<PROJECT_ID> <REGION> <CLUSTER_NAME> <HADOOP_FS_QUERY>"
 *
 * You can also set these arguments in the main function instead of providing them via the CLI.
 */
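
/* Example invocation (placeholder values, not part of the commit):
 *   mvn exec:java -Dexec.args="my-project-id us-east1 my-cluster '-ls /'"
 */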

import com.google.cloud.dataproc.v1.*;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class SubmitHadoopFSJob {

  public static Job waitForJobCompletion(
      JobControllerClient jobControllerClient, String projectId, String region, String jobId) {
    while (true) {
      // Poll the service periodically until the Job is in a finished state.
      Job jobInfo = jobControllerClient.getJob(projectId, region, jobId);
      switch (jobInfo.getStatus().getState()) {
        case DONE:
        case CANCELLED:
        case ERROR:
          return jobInfo;
        default:
          try {
            // Wait a second in between polling attempts.
            TimeUnit.SECONDS.sleep(1);
          } catch (InterruptedException e) {
            throw new RuntimeException(e);
          }
      }
    }
  }

  public static void submitHadoopFSJob(
      String projectId, String region, String clusterName, String hadoopFSQuery)
      throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the cluster controller and job controller clients.
    JobControllerSettings jobControllerSettings =
        JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Create both a cluster controller client and job controller client with the
    // configured settings. The clients only need to be created once and can be reused for
    // multiple requests. Using a try-with-resources closes the clients, but this can also be
    // done manually with the .close() method.
    try (JobControllerClient jobControllerClient =
            JobControllerClient.create(jobControllerSettings);
        ClusterControllerClient clusterControllerClient =
            ClusterControllerClient.create(clusterControllerSettings)) {

      // Configure cluster placement for the job.
      JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();

      // Configure Hadoop job settings. The Hadoop FS query is split into
      // whitespace-separated arguments for org.apache.hadoop.fs.FsShell.
      HadoopJob hadoopJob =
          HadoopJob.newBuilder()
              .setMainClass("org.apache.hadoop.fs.FsShell")
              .addAllArgs(Arrays.asList(hadoopFSQuery.split(" ")))
              .build();

      Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();

      // Submit an asynchronous request to execute the job.
      Job request = jobControllerClient.submitJob(projectId, region, job);
      String jobId = request.getReference().getJobId();
      System.out.println(String.format("Submitted job \"%s\"", jobId));

      // Wait for the job to finish.
      CompletableFuture<Job> finishedJobFuture =
          CompletableFuture.supplyAsync(
              () -> waitForJobCompletion(jobControllerClient, projectId, region, jobId));
      int timeout = 10;
      try {
        Job jobInfo = finishedJobFuture.get(timeout, TimeUnit.MINUTES);

        // Cloud Dataproc job output gets saved to a GCS bucket allocated to it.
        Cluster clusterInfo = clusterControllerClient.getCluster(projectId, region, clusterName);
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob =
            storage.get(
                clusterInfo.getConfig().getConfigBucket(),
                String.format(
                    "google-cloud-dataproc-metainfo/%s/jobs/%s/driveroutput.000000000",
                    clusterInfo.getClusterUuid(), jobId));
        System.out.println(
            String.format(
                "Job \"%s\" finished with state %s:%n%s",
                jobId, jobInfo.getStatus().getState(), new String(blob.getContent())));
      } catch (TimeoutException e) {
        System.err.println(
            String.format("Job timed out after %d minutes: %s", timeout, e.getMessage()));
      }
    } catch (ExecutionException e) {
      System.err.println(String.format("Error submitting job: %s", e.getMessage()));
    }
  }

  public static void main(String... args) throws IOException, InterruptedException {
    if (args.length != 4) {
      System.err.println(
          "Insufficient number of parameters provided. Please make sure "
              + "PROJECT_ID, REGION, CLUSTER_NAME, and HADOOP_FS_QUERY are provided, in this order.");
      return;
    }

    String projectId = args[0]; // project ID of the project the cluster belongs to
    String region = args[1]; // region of the cluster
    String clusterName = args[2]; // name of the cluster to submit the job to
    String hadoopFSQuery = args[3]; // Hadoop FS query to run, e.g. "-ls /"

    submitHadoopFSJob(projectId, region, clusterName, hadoopFSQuery);
  }
}
// [END dataproc_submit_hadoop_fs_job]
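
For reference, a minimal sketch of calling the sample programmatically rather than through the CLI; the project, region, cluster, and query values below are placeholders, and the cluster is assumed to already exist:

public class SubmitHadoopFSJobDemo {
  public static void main(String... args) throws Exception {
    // Placeholder values only; substitute a real project, region, and existing cluster.
    SubmitHadoopFSJob.submitHadoopFSJob("my-project-id", "us-east1", "my-cluster", "-ls /");
  }
}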

SubmitHadoopFSJobTest.java (new file)
/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static junit.framework.TestCase.assertNotNull;
import static org.hamcrest.MatcherAssert.assertThat;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.ClusterOperationMetadata;
import com.google.protobuf.Empty;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import org.hamcrest.CoreMatchers;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

@RunWith(JUnit4.class)
public class SubmitHadoopFSJobTest {

  private static final String CLUSTER_NAME =
      String.format("java-cc-test--%s", UUID.randomUUID().toString());
  private static final String REGION = "us-east1";
  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");

  private ByteArrayOutputStream bout;

  private static void requireEnv(String varName) {
    assertNotNull(
        String.format("Environment variable '%s' is required to perform these tests.", varName),
        System.getenv(varName));
  }

  @BeforeClass
  public static void checkRequirements() {
    requireEnv("GOOGLE_APPLICATION_CREDENTIALS");
    requireEnv("GOOGLE_CLOUD_PROJECT");
  }

  @Before
  public void setUp() {
    bout = new ByteArrayOutputStream();
    System.setOut(new PrintStream(bout));
  }

  @Test
  public void submitHadoopFSJobTest() throws IOException, InterruptedException {
    // Create a cluster to submit the job to, then run a simple Hadoop FS query on it.
    CreateCluster.createCluster(PROJECT_ID, REGION, CLUSTER_NAME);
    SubmitHadoopFSJob.submitHadoopFSJob(PROJECT_ID, REGION, CLUSTER_NAME, "-ls /");
    String output = bout.toString();

    System.out.println(output);
    assertThat(output, CoreMatchers.containsString(CLUSTER_NAME));
    assertThat(output, CoreMatchers.containsString("Submitted job"));
  }

  @After
  public void tearDown() throws IOException, InterruptedException, ExecutionException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", REGION);

    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    try (ClusterControllerClient clusterControllerClient =
        ClusterControllerClient.create(clusterControllerSettings)) {
      OperationFuture<Empty, ClusterOperationMetadata> deleteClusterAsyncRequest =
          clusterControllerClient.deleteClusterAsync(PROJECT_ID, REGION, CLUSTER_NAME);
      deleteClusterAsyncRequest.get();
    }
  }
}
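
A note on running the test: it requires the GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT environment variables (checked in checkRequirements) and assumes the CreateCluster sample class is on the test classpath. With Maven Surefire, the class can be run on its own with, for example:

mvn -Dtest=SubmitHadoopFSJobTest test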