Commit

Adding HadoopFS sample

bradmiro committed May 20, 2020
1 parent 52c592f commit 933900b
Showing 2 changed files with 242 additions and 0 deletions.
151 changes: 151 additions & 0 deletions dataproc/src/main/java/SubmitHadoopFSJob.java
@@ -0,0 +1,151 @@
/*
* Copyright 2019 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// [START dataproc_submit_hadoop_fs_job]
/* This sample walks a user through submitting a Hadoop FS job (a query against
 * the Hadoop filesystem) to an existing Cloud Dataproc cluster and reading the
 * job's driver output, all using the Java client library.
 *
 * Usage:
 * mvn clean package -DskipTests
 *
 * mvn exec:java -Dexec.args="<PROJECT_ID> <REGION> <CLUSTER_NAME> <HADOOP_FS_QUERY>"
 *
 * You can also set these arguments in the main function instead of providing them via the CLI.
 */
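// Example invocation (with hypothetical project and cluster values):
//   mvn exec:java -Dexec.args="my-project-id us-central1 my-cluster '-ls /'"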

import com.google.cloud.dataproc.v1.Cluster;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.HadoopJob;
import com.google.cloud.dataproc.v1.Job;
import com.google.cloud.dataproc.v1.JobControllerClient;
import com.google.cloud.dataproc.v1.JobControllerSettings;
import com.google.cloud.dataproc.v1.JobPlacement;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class SubmitHadoopFSJob {

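  /**
   * Polls the job's status until it reaches a terminal state (DONE, CANCELLED
   * or ERROR) and returns the final Job info.
   */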
public static Job waitForJobCompletion(
JobControllerClient jobControllerClient, String projectId, String region, String jobId) {
while (true) {
// Poll the service periodically until the Job is in a finished state.
Job jobInfo = jobControllerClient.getJob(projectId, region, jobId);
switch (jobInfo.getStatus().getState()) {
case DONE:
case CANCELLED:
case ERROR:
return jobInfo;
default:
try {
// Wait a second in between polling attempts.
TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
}

public static void submitHadoopFSJob(
String projectId, String region, String clusterName, String hadoopFSQuery)
throws IOException, InterruptedException {
String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings =
        JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Configure the settings for the cluster controller client.
    ClusterControllerSettings clusterControllerSettings =
        ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

    // Create both a cluster controller client and a job controller client with the
    // configured settings. The clients only need to be created once and can be reused for
    // multiple requests. Using a try-with-resources closes the clients, but this can also be
    // done manually with the .close() method.
    try (ClusterControllerClient clusterControllerClient =
            ClusterControllerClient.create(clusterControllerSettings);
        JobControllerClient jobControllerClient =
            JobControllerClient.create(jobControllerSettings)) {

// Configure cluster placement for the job.
JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();

      // Configure Hadoop job settings. The HadoopFS query is set here, split on
      // whitespace so that each token is passed to FsShell as a separate argument.
      HadoopJob hadoopJob =
          HadoopJob.newBuilder()
              .setMainClass("org.apache.hadoop.fs.FsShell")
              .addAllArgs(Arrays.asList(hadoopFSQuery.split(" ")))
              .build();
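      // Example queries the user might pass in: "-ls /", "-mkdir -p /tmp/test",
      // "-rm -r /tmp/test" (standard Hadoop FsShell commands).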

      // Combine the job placement and Hadoop job settings into a job request.
      Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();

      // Submit the job. The job executes asynchronously on the cluster; the
      // returned Job object contains the server-assigned job ID.
      Job submittedJob = jobControllerClient.submitJob(projectId, region, job);
      String jobId = submittedJob.getReference().getJobId();
System.out.println(String.format("Submitted job \"%s\"", jobId));

// Wait for the job to finish.
CompletableFuture<Job> finishedJobFuture =
CompletableFuture.supplyAsync(
() -> waitForJobCompletion(jobControllerClient, projectId, region, jobId));
      int timeout = 10; // maximum number of minutes to wait for the job to finish
try {
        Job jobInfo = finishedJobFuture.get(timeout, TimeUnit.MINUTES);

        // Cloud Dataproc job driver output gets saved to the cluster's staging bucket
        // in GCS; the first chunk of output is the "driveroutput.000000000" blob.
Cluster clusterInfo = clusterControllerClient.getCluster(projectId, region, clusterName);
Storage storage = StorageOptions.getDefaultInstance().getService();
Blob blob =
storage.get(
clusterInfo.getConfig().getConfigBucket(),
String.format(
"google-cloud-dataproc-metainfo/%s/jobs/%s/driveroutput.000000000",
clusterInfo.getClusterUuid(), jobId));
System.out.println(
String.format(
"Job \"%s\" finished with state %s:%n%s",
jobId, jobInfo.getStatus().getState(), new String(blob.getContent())));
} catch (TimeoutException e) {
System.err.println(
String.format("Job timed out after %d minutes: %s", timeout, e.getMessage()));
      }
} catch (ExecutionException e) {
System.err.println(String.format("Error executing quickstart: %s ", e.getMessage()));
}
}

public static void main(String... args) throws IOException, InterruptedException {
    if (args.length != 4) {
      System.err.println(
          "Insufficient number of parameters provided. Please make sure a "
              + "PROJECT_ID, REGION, CLUSTER_NAME and HADOOP_FS_QUERY are provided, in this order.");
      return;
    }

    String projectId = args[0]; // project-id of the project the cluster belongs to
    String region = args[1]; // region the cluster is located in
    String clusterName = args[2]; // name of the cluster to submit the job to
    String hadoopFSQuery = args[3]; // a Hadoop FS command, e.g. "-ls /"

    submitHadoopFSJob(projectId, region, clusterName, hadoopFSQuery);
}
}
// [END dataproc_submit_hadoop_fs_job]
91 changes: 91 additions & 0 deletions dataproc/src/test/java/SubmitHadoopFSJobTest.java
@@ -0,0 +1,91 @@
/*
* Copyright 2019 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import static junit.framework.TestCase.assertNotNull;
import static org.hamcrest.MatcherAssert.assertThat;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.ClusterOperationMetadata;
import com.google.protobuf.Empty;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import org.hamcrest.CoreMatchers;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

@RunWith(JUnit4.class)
public class SubmitHadoopFSJobTest {

private static final String CLUSTER_NAME =
String.format("java-cc-test--%s", UUID.randomUUID().toString());
private static final String REGION = "us-east1";
  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
  // An example Hadoop FS command for the job to run: list the root directory.
  private static final String HADOOP_FS_QUERY = "-ls /";

private ByteArrayOutputStream bout;

private static void requireEnv(String varName) {
assertNotNull(
String.format("Environment variable '%s' is required to perform these tests.", varName),
System.getenv(varName));
}

@BeforeClass
public static void checkRequirements() {
requireEnv("GOOGLE_APPLICATION_CREDENTIALS");
requireEnv("GOOGLE_CLOUD_PROJECT");
}

@Before
public void setUp() {
bout = new ByteArrayOutputStream();
System.setOut(new PrintStream(bout));
}

  @Test
  public void submitHadoopFSJobTest() throws IOException, InterruptedException {
    // Create a cluster for the job to run on, then submit the Hadoop FS job to it.
    CreateCluster.createCluster(PROJECT_ID, REGION, CLUSTER_NAME);
    SubmitHadoopFSJob.submitHadoopFSJob(PROJECT_ID, REGION, CLUSTER_NAME, HADOOP_FS_QUERY);
    String output = bout.toString();

    System.out.println(output);
    assertThat(output, CoreMatchers.containsString(CLUSTER_NAME));
    assertThat(output, CoreMatchers.containsString("finished"));
  }

@After
public void tearDown() throws IOException, InterruptedException, ExecutionException {
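    // Delete the cluster that was created for the test so resources don't leak.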
String myEndpoint = String.format("%s-dataproc.googleapis.com:443", REGION);

ClusterControllerSettings clusterControllerSettings =
ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();

try (ClusterControllerClient clusterControllerClient =
ClusterControllerClient.create(clusterControllerSettings)) {
OperationFuture<Empty, ClusterOperationMetadata> deleteClusterAsyncRequest =
clusterControllerClient.deleteClusterAsync(PROJECT_ID, REGION, CLUSTER_NAME);
deleteClusterAsyncRequest.get();
}
}
}
