-
Notifications
You must be signed in to change notification settings - Fork 6.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
samples: migrate v1beta2 doc AI samples (#79)
* samples: migrate v1beta2 doc AI samples * added noxfile * reformatted code * organized imports in right order * lint * finally fixed lint * reorganized folders * imports * added from prefix imports * renamed files * renamed package on tests files * nit
- Loading branch information
1 parent
984627e
commit deeba8e
Showing
15 changed files
with
825 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# [START documentai_batch_parse_form_beta] | ||
import re | ||
|
||
from google.cloud import documentai_v1beta2 as documentai | ||
from google.cloud import storage | ||
|
||
|
||
def batch_parse_form(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the form fields of a document stored in Cloud Storage.

    Builds a single-document batch request with form extraction enabled,
    waits for the resulting operation to finish, then prints the names of
    the blobs found under the destination prefix.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the PDF document to process.
        destination_uri: ``gs://bucket/prefix`` URI under which results are
            written. Both the bucket and a non-empty prefix are required.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Validate the destination before any remote call so a malformed URI
    # fails immediately instead of after the batch operation has finished.
    uri_match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if uri_match is None:
        raise ValueError(
            "destination_uri must look like gs://bucket/prefix, got: "
            "{!r}".format(destination_uri)
        )
    output_bucket, prefix = uri_match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key="Emergency Contact", value_types=["NAME"]
        ),
        documentai.types.KeyValuePairHint(key="Referred By"),
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params,
    )

    # The batch request takes a list of ProcessDocumentRequest objects;
    # this sample submits exactly one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print("Output files:")
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
|
||
|
||
# [END documentai_batch_parse_form_beta] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import uuid | ||
|
||
from google.cloud import storage | ||
|
||
import pytest | ||
|
||
from samples.snippets import batch_parse_form_v1beta2 | ||
|
||
|
||
BUCKET = "document-ai-{}".format(uuid.uuid4()) | ||
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4()) | ||
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] | ||
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" | ||
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX) | ||
|
||
|
||
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a scratch bucket for the batch output, then remove it."""
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # Teardown: force=True is passed so the delete is not blocked by the
    # result objects the sample wrote (see Bucket.delete documentation).
    scratch_bucket.delete(force=True)
|
||
|
||
def test_batch_parse_form(capsys):
    """Run the form sample end to end and check it reports its output."""
    batch_parse_form_v1beta2.batch_parse_form(
        project_id=PROJECT_ID,
        input_uri=INPUT_URI,
        destination_uri=BATCH_OUTPUT_URI,
    )
    captured = capsys.readouterr()
    assert "Output files" in captured.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# [START documentai_batch_parse_table_beta] | ||
import re | ||
|
||
from google.cloud import documentai_v1beta2 as documentai | ||
from google.cloud import storage | ||
|
||
|
||
def batch_parse_table(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the tables of a document stored in Cloud Storage.

    Builds a single-document batch request with table extraction enabled,
    waits for the resulting operation to finish, then prints the names of
    the blobs found under the destination prefix.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the PDF document to process.
        destination_uri: ``gs://bucket/prefix`` URI under which results are
            written. Both the bucket and a non-empty prefix are required.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Validate the destination before any remote call so a malformed URI
    # fails immediately instead of after the batch operation has finished.
    uri_match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if uri_match is None:
        raise ValueError(
            "destination_uri must look like gs://bucket/prefix, got: "
            "{!r}".format(destination_uri)
        )
    output_bucket, prefix = uri_match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1),
                ]
            ),
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params,
    )

    # The batch request takes a list of ProcessDocumentRequest objects;
    # this sample submits exactly one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print("Output files:")
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
|
||
|
||
# [END documentai_batch_parse_table_beta] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import uuid | ||
|
||
from google.cloud import storage | ||
|
||
import pytest | ||
|
||
from samples.snippets import batch_parse_table_v1beta2 | ||
|
||
|
||
BUCKET = "document-ai-{}".format(uuid.uuid4()) | ||
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4()) | ||
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] | ||
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf" | ||
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX) | ||
|
||
|
||
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a scratch bucket for the batch output, then remove it."""
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # Teardown: force=True is passed so the delete is not blocked by the
    # result objects the sample wrote (see Bucket.delete documentation).
    scratch_bucket.delete(force=True)
|
||
|
||
def test_batch_parse_table(capsys):
    """Run the table sample end to end and check it reports its output."""
    batch_parse_table_v1beta2.batch_parse_table(
        project_id=PROJECT_ID,
        input_uri=INPUT_URI,
        destination_uri=BATCH_OUTPUT_URI,
    )
    captured = capsys.readouterr()
    assert "Output files:" in captured.out
Oops, something went wrong.