sample: update flaky dataproc sample and test to be more stable #5665

Merged
merged 13 commits into from
Sep 10, 2024
Changes from 12 commits
161 changes: 57 additions & 104 deletions packages/google-cloud-dataproc/README.md
@@ -57,113 +57,66 @@ npm install @google-cloud/dataproc
### Using the client library

```javascript
// Note: this replaces the previous quickstart, which created a cluster,
// submitted a PySpark job from Cloud Storage, read its output, and deleted
// the cluster. The removed code appears in full in the samples/quickstart.js
// diff below.

/**
 * TODO(developer): Uncomment these variables before running the sample.
 */
/**
 * Required. The ID of the Google Cloud Platform project that the cluster
 * belongs to.
 */
// const projectId = 'abc123'
/**
 * Required. The Dataproc region in which to handle the request.
 */
// const region = 'us-central1'
/**
 * Optional. A filter constraining the clusters to list. Filters are
 * case-sensitive and have the following syntax:
 * field = value AND field = value ...
 * where **field** is one of `status.state`, `clusterName`, or `labels.KEY`,
 * and `[KEY]` is a label key. **value** can be `*` to match all values.
 * `status.state` can be one of the following: `ACTIVE`, `INACTIVE`,
 * `CREATING`, `RUNNING`, `ERROR`, `DELETING`, or `UPDATING`. `ACTIVE`
 * contains the `CREATING`, `UPDATING`, and `RUNNING` states. `INACTIVE`
 * contains the `DELETING` and `ERROR` states.
 * `clusterName` is the name of the cluster provided at creation time.
 * Only the logical `AND` operator is supported; space-separated items are
 * treated as having an implicit `AND` operator.
 * Example filter:
 * status.state = ACTIVE AND clusterName = mycluster
 * AND labels.env = staging AND labels.starred = *
 */
// const filter = 'abc123'
/**
 * Optional. The standard List page size.
 */
// const pageSize = 1234
/**
 * Optional. The standard List page token.
 */
// const pageToken = 'abc123'

// Imports the Dataproc library
const {ClusterControllerClient} = require('@google-cloud/dataproc').v1;

// Instantiates a client
const dataprocClient = new ClusterControllerClient();

async function callListClusters() {
  // Construct request
  const request = {
    projectId,
    region,
  };

  // Run request
  const iterable = dataprocClient.listClustersAsync(request);
  for await (const response of iterable) {
    console.log(response);
  }
}

callListClusters();
```
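The rewritten quickstart is read-only: it only lists clusters, so there is no cluster creation, job submission, or teardown left to flake. The commented-out `filter` and `pageSize` parameters documented above can be passed in the same request object. A minimal sketch of that, with an illustrative project ID, region, and filter string (none of these values come from the PR):

```javascript
// Sketch: list only clusters in the RUNNING state, using the filter
// syntax documented in the sample above. The project ID, region, and
// filter value are illustrative, not part of this PR.
const {ClusterControllerClient} = require('@google-cloud/dataproc').v1;

async function listRunningClusters() {
  const client = new ClusterControllerClient();
  const iterable = client.listClustersAsync({
    projectId: 'my-project', // hypothetical
    region: 'us-central1',
    filter: 'status.state = RUNNING',
    pageSize: 10,
  });
  for await (const cluster of iterable) {
    console.log(cluster.clusterName);
  }
}

listRunningClusters().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
```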

2 changes: 1 addition & 1 deletion packages/google-cloud-dataproc/package.json
@@ -69,4 +69,4 @@
"typescript": "^5.1.6"
},
"homepage": "https://github.com/googleapis/google-cloud-node/tree/main/packages/google-cloud-dataproc"
}
2 changes: 1 addition & 1 deletion packages/google-cloud-dataproc/samples/README.md
@@ -830,7 +830,7 @@ View the [source code](https://github.com/googleapis/google-cloud-node/blob/main
__Usage:__


Before: `node quickstart.js <PROJECT_ID> <REGION> <CLUSTER_NAME> <JOB_FILE_PATH>`

After: `node packages/google-cloud-dataproc/samples/quickstart.js`



2 changes: 1 addition & 1 deletion packages/google-cloud-dataproc/samples/package.json
@@ -22,4 +22,4 @@
"mocha": "^8.0.0",
"uuid": "^9.0.0"
}
}
173 changes: 66 additions & 107 deletions packages/google-cloud-dataproc/samples/quickstart.js
@@ -1,127 +1,86 @@
// ----- Before: samples/quickstart.js (removed) -----

// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// sample-metadata:
//   title: Quickstart
//   usage: node quickstart.js <PROJECT_ID> <REGION> <CLUSTER_NAME> <JOB_FILE_PATH>

// [START dataproc_quickstart]
// This quickstart sample walks a user through creating a Dataproc
// cluster, submitting a PySpark job from Google Cloud Storage to the
// cluster, reading the output of the job and deleting the cluster, all
// using the Node.js client library.

'use strict';

function main(projectId, region, clusterName, jobFilePath) {
  const dataproc = require('@google-cloud/dataproc');
  const {Storage} = require('@google-cloud/storage');

  // Create a cluster client with the endpoint set to the desired cluster region
  const clusterClient = new dataproc.v1.ClusterControllerClient({
    apiEndpoint: `${region}-dataproc.googleapis.com`,
    projectId: projectId,
  });

  // Create a job client with the endpoint set to the desired cluster region
  const jobClient = new dataproc.v1.JobControllerClient({
    apiEndpoint: `${region}-dataproc.googleapis.com`,
    projectId: projectId,
  });

  async function quickstart() {
    // Create the cluster config
    const cluster = {
      projectId: projectId,
      region: region,
      cluster: {
        clusterName: clusterName,
        config: {
          masterConfig: {
            numInstances: 1,
            machineTypeUri: 'n1-standard-2',
          },
          workerConfig: {
            numInstances: 2,
            machineTypeUri: 'n1-standard-2',
          },
        },
      },
    };

    // Create the cluster
    const [operation] = await clusterClient.createCluster(cluster);
    const [response] = await operation.promise();

    // Output a success message
    console.log(`Cluster created successfully: ${response.clusterName}`);

    const job = {
      projectId: projectId,
      region: region,
      job: {
        placement: {
          clusterName: clusterName,
        },
        pysparkJob: {
          mainPythonFileUri: jobFilePath,
        },
      },
    };

    const [jobOperation] = await jobClient.submitJobAsOperation(job);
    const [jobResponse] = await jobOperation.promise();

    const matches =
      jobResponse.driverOutputResourceUri.match('gs://(.*?)/(.*)');

    const storage = new Storage();

    const output = await storage
      .bucket(matches[1])
      .file(`${matches[2]}.000000000`)
      .download();

    // Output a success message.
    console.log(`Job finished successfully: ${output}`);

    // Delete the cluster once the job has terminated.
    const deleteClusterReq = {
      projectId: projectId,
      region: region,
      clusterName: clusterName,
    };

    const [deleteOperation] =
      await clusterClient.deleteCluster(deleteClusterReq);
    await deleteOperation.promise();

    // Output a success message
    console.log(`Cluster ${clusterName} successfully deleted.`);
  }

  quickstart();
}

const args = process.argv.slice(2);

if (args.length !== 4) {
  console.log(
    'Insufficient number of parameters provided. Please make sure a ' +
      'PROJECT_ID, REGION, CLUSTER_NAME and JOB_FILE_PATH are provided, in this order.'
  );
}

main(...args);
// [END dataproc_quickstart]

// ----- After: samples/quickstart.js (added) -----

// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

function main(projectId, region) {
  // [START dataproc_v1_generated_quickstart]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  /**
   * Required. The ID of the Google Cloud Platform project that the cluster
   * belongs to.
   */
  // const projectId = 'abc123'
  /**
   * Required. The Dataproc region in which to handle the request.
   */
  // const region = 'us-central1'
  /**
   * Optional. A filter constraining the clusters to list. Filters are
   * case-sensitive and have the following syntax:
   * field = value AND field = value ...
   * where **field** is one of `status.state`, `clusterName`, or `labels.KEY`,
   * and `[KEY]` is a label key. **value** can be `*` to match all values.
   * `status.state` can be one of the following: `ACTIVE`, `INACTIVE`,
   * `CREATING`, `RUNNING`, `ERROR`, `DELETING`, or `UPDATING`. `ACTIVE`
   * contains the `CREATING`, `UPDATING`, and `RUNNING` states. `INACTIVE`
   * contains the `DELETING` and `ERROR` states.
   * `clusterName` is the name of the cluster provided at creation time.
   * Only the logical `AND` operator is supported; space-separated items are
   * treated as having an implicit `AND` operator.
   * Example filter:
   * status.state = ACTIVE AND clusterName = mycluster
   * AND labels.env = staging AND labels.starred = *
   */
  // const filter = 'abc123'
  /**
   * Optional. The standard List page size.
   */
  // const pageSize = 1234
  /**
   * Optional. The standard List page token.
   */
  // const pageToken = 'abc123'

  // Imports the Dataproc library
  const {ClusterControllerClient} = require('@google-cloud/dataproc').v1;

  // Instantiates a client
  const dataprocClient = new ClusterControllerClient();

  async function callListClusters() {
    // Construct request
    const request = {
      projectId,
      region,
    };

    // Run request
    const iterable = dataprocClient.listClustersAsync(request);
    for await (const response of iterable) {
      console.log(response);
    }
  }

  callListClusters();
  // [END dataproc_v1_generated_quickstart]
}

process.on('unhandledRejection', err => {
  console.error(err.message);
  process.exitCode = 1;
});
main(...process.argv.slice(2));
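The commented-out `pageToken` parameter in the new sample points at manual paging as an alternative to the async iterator. A sketch of that pattern, assuming the standard paging convention of gax-generated Node.js clients (the triple return value below comes from gax, not from anything shown in this diff):

```javascript
// Sketch: page through clusters manually. Assumes the usual generated-client
// convention that, with autoPaginate disabled, listClusters resolves to
// [clusters, nextRequest, rawResponse], where nextRequest is null on the
// last page. Project and region values are illustrative.
const {ClusterControllerClient} = require('@google-cloud/dataproc').v1;

async function listAllClustersPaged(projectId, region) {
  const client = new ClusterControllerClient();
  let request = {projectId, region, pageSize: 50};
  while (request) {
    const [clusters, nextRequest] = await client.listClusters(request, {
      autoPaginate: false,
    });
    for (const cluster of clusters) {
      console.log(cluster.clusterName);
    }
    request = nextRequest; // null when no pages remain
  }
}

listAllClustersPaged('my-project', 'us-central1').catch(console.error);
```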
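The test changes that give this PR its title are not visible in this view. Given the mocha devDependency in samples/package.json above, a stable test would presumably just execute the read-only sample and assert that it exits cleanly; a hypothetical sketch under those assumptions (the file path, environment variable, and assertion are illustrative, not the PR's actual test):

```javascript
// Hypothetical sketch of a stabler quickstart test; the real updated test
// is not part of this diff view. Assumes mocha (per samples/package.json)
// and a GCLOUD_PROJECT environment variable.
const {execSync} = require('child_process');
const assert = require('assert');

describe('dataproc quickstart sample', () => {
  it('lists clusters without creating any resources', () => {
    const projectId = process.env.GCLOUD_PROJECT; // assumed env var
    const output = execSync(`node quickstart.js ${projectId} us-central1`, {
      encoding: 'utf8',
    });
    // Listing is read-only: nothing to clean up and no cluster-provisioning
    // timeouts to wait on, so the assertion can stay loose.
    assert.ok(typeof output === 'string');
  });
});
```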