Skip to content

Commit

Permalink
feat: adds beta samples
Browse files Browse the repository at this point in the history
* feat: adds remaining samples (parse form synchronous, parse table synchronous, parse form with NL model, set endpoint)

* fix: adds AutoML NL model to tests

* fix: removes forEach() constructions
  • Loading branch information
telpirion authored and Ace Nassri committed Nov 14, 2022
1 parent 2c0f5a3 commit f72dff6
Show file tree
Hide file tree
Showing 12 changed files with 819 additions and 7 deletions.
140 changes: 140 additions & 0 deletions document-ai/batch_parse_form.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/**
* Copyright 2020 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

const uuid = require('uuid');

/**
 * Batch-process a PDF stored in Cloud Storage as a form, then print the
 * key/value pairs found on the first page of every result file.
 * @param {string} projectId your Google Cloud project ID
 * @param {string} location region to use for this operation
 * @param {string} gcsOutputUri Cloud Storage bucket that receives the results
 * @param {string} gcsOutputUriPrefix object-name prefix for the result files
 * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  location = 'YOUR_PROJECT_LOCATION',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4(),
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
  // [START document_parse_form]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
  // const gcsInputUri = 'GCS URI of the PDF to process';

  // Imports the Google Cloud client library
  const {
    DocumentUnderstandingServiceClient,
  } = require('@google-cloud/documentai');
  const {Storage} = require('@google-cloud/storage');

  const client = new DocumentUnderstandingServiceClient();
  const storage = new Storage();

  async function parseFormGCS(inputUri, outputUri, outputUriPrefix) {
    const parent = `projects/${projectId}/locations/${location}`;

    // Configure the batch process request.
    const request = {
      inputConfig: {
        gcsSource: {
          uri: inputUri,
        },
        mimeType: 'application/pdf',
      },
      outputConfig: {
        gcsDestination: {
          uri: `${outputUri}/${outputUriPrefix}/`,
        },
        pagesPerShard: 1,
      },
      formExtractionParams: {
        enabled: true,
        keyValuePairHints: [
          {
            key: 'Phone',
            valueTypes: ['PHONE_NUMBER'],
          },
          {
            key: 'Contact',
            valueTypes: ['EMAIL', 'NAME'],
          },
        ],
      },
    };

    // Configure the request for batch process
    const requests = {
      parent,
      requests: [request],
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    const [operation] = await client.batchProcessDocuments(requests);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: outputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(outputUri).getFiles(query);

    // Iterate with for...of instead of forEach(async ...): forEach ignores
    // the promises returned by its callback, so download/parse errors would
    // surface as unhandled rejections and output order would be arbitrary.
    for (const [index, fileInfo] of files.entries()) {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // Read the results
      const results = JSON.parse(file.toString());

      // Get all of the document text as one big string.
      const {text} = results;

      // Utility to extract text anchors from text field.
      const getText = textAnchor => {
        // An anchor without segments (e.g. an empty field value) yields no
        // text rather than a TypeError.
        const segments = (textAnchor && textAnchor.textSegments) || [];
        if (segments.length === 0) {
          return '\t';
        }

        // First shard in document doesn't have startIndex property
        const startIndex = segments[0].startIndex || 0;
        const endIndex = segments[0].endIndex;

        return `\t${text.substring(startIndex, endIndex)}`;
      };

      // Process the output; a result file may have no pages or form fields.
      const [page1] = results.pages || [];
      const formFields = (page1 && page1.formFields) || [];

      for (const field of formFields) {
        const fieldName = getText(field.fieldName.textAnchor);
        const fieldValue = getText(field.fieldValue.textAnchor);

        console.log('Extracted key value pair:');
        console.log(`\t(${fieldName}, ${fieldValue})`);
      }
    }
  }
  // [END document_parse_form]

  // Await the helper so any failure rejects main() and reaches the handler.
  await parseFormGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
}

main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
148 changes: 148 additions & 0 deletions document-ai/batch_parse_table.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/**
* Copyright 2020 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

const uuid = require('uuid');

/**
 * Batch-process a PDF stored in Cloud Storage for table extraction, then
 * print the header row of the first table found in every result file.
 * @param {string} projectId your Google Cloud project ID
 * @param {string} location region to use for this operation
 * @param {string} gcsOutputUri Cloud Storage bucket that receives the results
 * @param {string} gcsOutputUriPrefix object-name prefix for the result files
 * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  location = 'YOUR_PROJECT_LOCATION',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4(),
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
  // [START document_parse_table]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
  // const gcsInputUri = 'YOUR_SOURCE_PDF';

  // Imports the Google Cloud client library
  const {
    DocumentUnderstandingServiceClient,
  } = require('@google-cloud/documentai');
  const {Storage} = require('@google-cloud/storage');

  const client = new DocumentUnderstandingServiceClient();
  const storage = new Storage();

  async function parseTableGCS(inputUri, outputUri, outputUriPrefix) {
    const parent = `projects/${projectId}/locations/${location}`;

    // Configure the batch process request.
    const request = {
      inputConfig: {
        gcsSource: {
          uri: inputUri,
        },
        mimeType: 'application/pdf',
      },
      outputConfig: {
        gcsDestination: {
          uri: `${outputUri}/${outputUriPrefix}/`,
        },
        pagesPerShard: 1,
      },
      tableExtractionParams: {
        enabled: true,
        tableBoundHints: [
          {
            boundingBox: {
              normalizedVertices: [
                {x: 0, y: 0},
                {x: 1, y: 0},
                {x: 1, y: 1},
                {x: 0, y: 1},
              ],
            },
          },
        ],
      },
    };

    // Configure the request for batch process
    const requests = {
      parent,
      requests: [request],
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    // Note: first request to the service takes longer than subsequent
    // requests.
    const [operation] = await client.batchProcessDocuments(requests);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: outputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(outputUri).getFiles(query);

    // Iterate with for...of instead of forEach(async ...): forEach ignores
    // the promises returned by its callback, so download/parse errors would
    // surface as unhandled rejections and output order would be arbitrary.
    for (const [index, fileInfo] of files.entries()) {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // Read the results
      const results = JSON.parse(file.toString());

      // Get all of the document text as one big string
      const {text} = results;

      // Get the first table in the document. A result file is not
      // guaranteed to contain a table, so skip it instead of throwing.
      const [page1] = results.pages || [];
      const [table] = (page1 && page1.tables) || [];
      if (!table) {
        console.log('No table found in this file.');
        continue;
      }
      const [headerRow] = table.headerRows || [];

      console.log('Results from first table processed:');
      const [firstLanguage] = page1.detectedLanguages || [];
      if (firstLanguage) {
        console.log(`First detected language: ${firstLanguage.languageCode}`);
      }

      console.log('Header row:');
      const headerCells = (headerRow && headerRow.cells) || [];
      for (const tableCell of headerCells) {
        const segments = tableCell.layout.textAnchor.textSegments;
        if (segments && segments.length > 0) {
          // Extract shards from the text field
          // First shard in document doesn't have startIndex property
          const startIndex = segments[0].startIndex || 0;
          const endIndex = segments[0].endIndex;

          console.log(`\t${text.substring(startIndex, endIndex)}`);
        }
      }
    }
  }
  // [END document_parse_table]

  // Await the helper so any failure rejects main() and reaches the handler.
  await parseTableGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
}

main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
102 changes: 102 additions & 0 deletions document-ai/parse_form.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/**
* Copyright 2020, Google, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

/**
 * Process a single PDF as a form and print the key/value pairs found on its
 * first page.
 * @param {string} projectId your Google Cloud project ID
 * @param {string} location region to use for this operation
 * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
 */
async function main(
  projectId,
  location,
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
  // [START document_quickstart]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION';
  // const gcsInputUri = 'YOUR_SOURCE_PDF';

  const {
    DocumentUnderstandingServiceClient,
  } = require('@google-cloud/documentai');
  const client = new DocumentUnderstandingServiceClient();

  async function parseForm() {
    // Configure the request for processing the PDF
    const parent = `projects/${projectId}/locations/${location}`;
    const request = {
      parent,
      inputConfig: {
        gcsSource: {
          uri: gcsInputUri,
        },
        mimeType: 'application/pdf',
      },
      formExtractionParams: {
        enabled: true,
        keyValuePairHints: [
          {
            key: 'Phone',
            valueTypes: ['PHONE_NUMBER'],
          },
          {
            key: 'Contact',
            valueTypes: ['EMAIL', 'NAME'],
          },
        ],
      },
    };

    // Recognizes text entities in the PDF document
    const [result] = await client.processDocument(request);

    // Get all of the document text as one big string
    const {text} = result;

    // Extract shards from the text field
    const getText = textAnchor => {
      // An anchor without segments (e.g. an empty field value) yields an
      // empty string rather than a TypeError.
      const segments = (textAnchor && textAnchor.textSegments) || [];
      if (segments.length === 0) {
        return '';
      }

      // First shard in document doesn't have startIndex property
      const startIndex = segments[0].startIndex || 0;
      const endIndex = segments[0].endIndex;

      return text.substring(startIndex, endIndex);
    };

    // Process the output; the response may have no pages or form fields.
    const [page1] = result.pages || [];
    const formFields = (page1 && page1.formFields) || [];

    for (const field of formFields) {
      const fieldName = getText(field.fieldName.textAnchor);
      const fieldValue = getText(field.fieldValue.textAnchor);

      console.log('Extracted key value pair:');
      console.log(`\t(${fieldName}, ${fieldValue})`);
    }
  }
  // [END document_quickstart]
  await parseForm();
}

main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Loading

0 comments on commit f72dff6

Please sign in to comment.