feat: adds beta samples

* feat: adds remaining samples (parse form synchronous, parse table synchronous, parse form with NL model, set endpoint)
* fix: adds AutoML NL model to tests
* fix: removes forEach() constructions
GoogleCloudPlatform · Nov 14, 2022 · f72dff6 · f72dff6
1 parent 2c0f5a3
commit f72dff6
Show file tree

Hide file tree

Showing 12 changed files with 819 additions and 7 deletions.
diff --git a/document-ai/batch_parse_form.js b/document-ai/batch_parse_form.js
@@ -0,0 +1,140 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+/**
+ * Batch-process a PDF stored in Cloud Storage as a form and print the
+ * key/value pairs extracted into each result file.
+ * @param {string} projectId your Google Cloud project ID
+ * @param {string} location region to use for this operation
+ * @param {string} gcsOutputUri Cloud Storage bucket that receives the results
+ * @param {string} gcsOutputUriPrefix prefix under which result files are written
+ * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
+ */
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  location = 'YOUR_PROJECT_LOCATION',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4(),
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+) {
+  // [START document_parse_form]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION';
+  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+  // const gcsInputUri = 'GCS URI of the PDF to process';
+
+  // Imports the Google Cloud client library
+  const {
+    DocumentUnderstandingServiceClient,
+  } = require('@google-cloud/documentai');
+  const {Storage} = require('@google-cloud/storage');
+
+  const client = new DocumentUnderstandingServiceClient();
+  const storage = new Storage();
+
+  async function parseFormGCS(inputUri, outputUri, outputUriPrefix) {
+    const parent = `projects/${projectId}/locations/${location}`;
+
+    // Configure the batch process request.
+    const request = {
+      inputConfig: {
+        gcsSource: {
+          uri: inputUri,
+        },
+        mimeType: 'application/pdf',
+      },
+      outputConfig: {
+        gcsDestination: {
+          uri: `${outputUri}/${outputUriPrefix}/`,
+        },
+        pagesPerShard: 1,
+      },
+      formExtractionParams: {
+        enabled: true,
+        keyValuePairHints: [
+          {
+            key: 'Phone',
+            valueTypes: ['PHONE_NUMBER'],
+          },
+          {
+            key: 'Contact',
+            valueTypes: ['EMAIL', 'NAME'],
+          },
+        ],
+      },
+    };
+
+    // Configure the request for batch process
+    const requests = {
+      parent,
+      requests: [request],
+    };
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    const [operation] = await client.batchProcessDocuments(requests);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {
+      prefix: outputUriPrefix,
+    };
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(outputUri).getFiles(query);
+
+    // Use for...of (not forEach) so every download is awaited: output stays
+    // in file order and any failure rejects this function's promise.
+    for (const [index, fileInfo] of files.entries()) {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // Read the results
+      const results = JSON.parse(file.toString());
+
+      // Get all of the document text as one big string.
+      const {text} = results;
+
+      // Utility to extract text anchors from text field.
+      const getText = textAnchor => {
+        const segments = (textAnchor && textAnchor.textSegments) || [];
+        if (segments.length === 0) {
+          // Nothing is anchored in the text; keep the tab-prefixed shape.
+          return '\t';
+        }
+
+        // startIndex may be absent, in which case the segment starts at 0.
+        const startIndex = segments[0].startIndex || 0;
+        const endIndex = segments[0].endIndex;
+
+        return `\t${text.substring(startIndex, endIndex)}`;
+      };
+
+      // Process the output. A result file may hold no page or no form
+      // fields, in which case there is nothing to print for it.
+      const [page1] = results.pages || [];
+      const formFields = (page1 && page1.formFields) || [];
+
+      for (const field of formFields) {
+        const fieldName = getText(field.fieldName.textAnchor);
+        const fieldValue = getText(field.fieldValue.textAnchor);
+
+        console.log('Extracted key value pair:');
+        console.log(`\t(${fieldName}, ${fieldValue})`);
+      }
+    }
+  }
+  // [END document_parse_form]
+
+  await parseFormGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
+}
+
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/batch_parse_table.js b/document-ai/batch_parse_table.js
@@ -0,0 +1,148 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+/**
+ * Batch-process a PDF stored in Cloud Storage for tables and print the header
+ * row of the first table found in each result file.
+ * @param {string} projectId your Google Cloud project ID
+ * @param {string} location region to use for this operation
+ * @param {string} gcsOutputUri Cloud Storage bucket that receives the results
+ * @param {string} gcsOutputUriPrefix prefix under which result files are written
+ * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
+ */
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  location = 'YOUR_PROJECT_LOCATION',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4(),
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+) {
+  // [START document_parse_table]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION';
+  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+
+  // Imports the Google Cloud client library
+  const {
+    DocumentUnderstandingServiceClient,
+  } = require('@google-cloud/documentai');
+  const {Storage} = require('@google-cloud/storage');
+
+  const client = new DocumentUnderstandingServiceClient();
+  const storage = new Storage();
+
+  async function parseTableGCS(inputUri, outputUri, outputUriPrefix) {
+    const parent = `projects/${projectId}/locations/${location}`;
+
+    // Configure the batch process request.
+    const request = {
+      inputConfig: {
+        gcsSource: {
+          uri: inputUri,
+        },
+        mimeType: 'application/pdf',
+      },
+      outputConfig: {
+        gcsDestination: {
+          uri: `${outputUri}/${outputUriPrefix}/`,
+        },
+        pagesPerShard: 1,
+      },
+      tableExtractionParams: {
+        enabled: true,
+        tableBoundHints: [
+          {
+            // Hint covering the whole page (normalized 0..1 coordinates).
+            boundingBox: {
+              normalizedVertices: [
+                {x: 0, y: 0},
+                {x: 1, y: 0},
+                {x: 1, y: 1},
+                {x: 0, y: 1},
+              ],
+            },
+          },
+        ],
+      },
+    };
+
+    // Configure the request for batch process
+    const requests = {
+      parent,
+      requests: [request],
+    };
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    // Note: first request to the service takes longer than subsequent
+    // requests.
+    const [operation] = await client.batchProcessDocuments(requests);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {
+      prefix: outputUriPrefix,
+    };
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(outputUri).getFiles(query);
+
+    // Use for...of (not forEach) so every download is awaited: output stays
+    // in file order and any failure rejects this function's promise.
+    for (const [index, fileInfo] of files.entries()) {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // Read the results
+      const results = JSON.parse(file.toString());
+
+      // Get all of the document text as one big string
+      const text = results.text;
+
+      // Get the first table in the document. A result file may hold a page
+      // without any table; skip it instead of throwing.
+      const [page1] = results.pages || [];
+      const [table] = (page1 && page1.tables) || [];
+      const [headerRow] = (table && table.headerRows) || [];
+      if (!headerRow) {
+        console.log('No table header row found in this file.');
+        continue;
+      }
+
+      console.log('Results from first table processed:');
+      const [language] = page1.detectedLanguages || [];
+      if (language) {
+        console.log(`First detected language: ${language.languageCode}`);
+      }
+
+      console.log('Header row:');
+      for (const tableCell of headerRow.cells) {
+        const segments = tableCell.layout.textAnchor.textSegments;
+        if (segments && segments.length > 0) {
+          // Extract shards from the text field
+          // First shard in document doesn't have startIndex property
+          const startIndex = segments[0].startIndex || 0;
+          const endIndex = segments[0].endIndex;
+
+          console.log(`\t${text.substring(startIndex, endIndex)}`);
+        }
+      }
+    }
+  }
+  // [END document_parse_table]
+
+  await parseTableGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
+}
+
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/parse_form.js b/document-ai/parse_form.js
@@ -0,0 +1,102 @@
+/**
+ * Copyright 2020, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+/**
+ * Process a single PDF as a form and print the key/value pairs found on its
+ * first page.
+ * @param {string} projectId your Google Cloud project ID
+ * @param {string} location region to use for this operation
+ * @param {string} gcsInputUri Cloud Storage URI of the PDF document to parse
+ */
+async function main(
+  projectId,
+  location,
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
+) {
+  // [START document_quickstart]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION';
+  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+
+  const {
+    DocumentUnderstandingServiceClient,
+  } = require('@google-cloud/documentai');
+  const client = new DocumentUnderstandingServiceClient();
+
+  async function parseForm() {
+    // Configure the request for processing the PDF
+    const parent = `projects/${projectId}/locations/${location}`;
+    const request = {
+      parent,
+      inputConfig: {
+        gcsSource: {
+          uri: gcsInputUri,
+        },
+        mimeType: 'application/pdf',
+      },
+      formExtractionParams: {
+        enabled: true,
+        keyValuePairHints: [
+          {
+            key: 'Phone',
+            valueTypes: ['PHONE_NUMBER'],
+          },
+          {
+            key: 'Contact',
+            valueTypes: ['EMAIL', 'NAME'],
+          },
+        ],
+      },
+    };
+
+    // Recognizes text entities in the PDF document
+    const [result] = await client.processDocument(request);
+
+    // Get all of the document text as one big string
+    const {text} = result;
+
+    // Extract shards from the text field
+    const getText = textAnchor => {
+      const segments = (textAnchor && textAnchor.textSegments) || [];
+      if (segments.length === 0) {
+        // Nothing is anchored in the text for this element.
+        return '';
+      }
+
+      // First shard in document doesn't have startIndex property
+      const startIndex = segments[0].startIndex || 0;
+      const endIndex = segments[0].endIndex;
+
+      return text.substring(startIndex, endIndex);
+    };
+
+    // Process the output. Only the first page is inspected; a response
+    // without pages or form fields simply prints nothing.
+    const [page1] = result.pages || [];
+    const formFields = (page1 && page1.formFields) || [];
+
+    for (const field of formFields) {
+      const fieldName = getText(field.fieldName.textAnchor);
+      const fieldValue = getText(field.fieldValue.textAnchor);
+
+      console.log('Extracted key value pair:');
+      console.log(`\t(${fieldName}, ${fieldValue})`);
+    }
+  }
+  // [END document_quickstart]
+  await parseForm();
+}
+
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});