Skip to content

Commit

Permalink
samples: migrate v1beta2 doc AI samples (#79)
Browse files Browse the repository at this point in the history
* samples: migrate v1beta2 doc AI samples

* added noxfile

* reformatted code

* organized imports in right order

* lint

* finally fixed lint

* reorganized folders

* imports

* added from prefix imports

* renamed files

* renamed package on tests files

* nit
  • Loading branch information
munkhuushmgl authored and holtskinner committed Jan 3, 2023
1 parent efb2acc commit 7b2f8c9
Show file tree
Hide file tree
Showing 15 changed files with 825 additions and 18 deletions.
99 changes: 99 additions & 0 deletions batch_parse_form_v1beta2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_parse_form_beta]
import re

from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage


def batch_parse_form(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the form fields of a document stored in Cloud Storage.

    Submits a single-document batch request with form extraction enabled,
    waits for the long-running operation to finish, then prints the names
    of the blobs found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to parse (sent as a PDF).
        destination_uri: ``gs://bucket/prefix`` location for the results.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Validate the destination before any work is done: the bucket and
    # prefix are needed to list the results afterwards, and failing here
    # avoids running the whole batch operation only to crash at the end.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            "destination_uri must have the form gs://bucket/prefix, "
            "got {!r}".format(destination_uri)
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key="Emergency Contact", value_types=["NAME"]
        ),
        documentai.types.KeyValuePairHint(key="Referred By"),
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params,
    )

    # The batch request carries a list of ProcessDocumentRequest objects;
    # this sample submits exactly one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print("Output files:")
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)


# [END documentai_batch_parse_form_beta]
46 changes: 46 additions & 0 deletions batch_parse_form_v1beta2_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import uuid

from google.cloud import storage

import pytest

from samples.snippets import batch_parse_form_v1beta2


# Project and input document come from the environment / a fixed sample URI.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"

# A uuid4 suffix gives each import of this module its own bucket name and
# output prefix; the output URI is assembled from the two.
BUCKET = "document-ai-{}".format(uuid.uuid4())
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)


@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a temporary output bucket for the duration of each test.

    The bucket named by ``BUCKET`` is created before the test body runs and
    force-deleted once the test has finished.
    """
    temp_bucket = storage.Client().create_bucket(BUCKET)

    # Hand control to the test; everything below runs as teardown.
    yield

    temp_bucket.delete(force=True)


def test_batch_parse_form(capsys):
    """Run the form sample and check that it prints its output header."""
    batch_parse_form_v1beta2.batch_parse_form(
        PROJECT_ID,
        INPUT_URI,
        BATCH_OUTPUT_URI,
    )

    captured = capsys.readouterr()
    assert "Output files" in captured.out
107 changes: 107 additions & 0 deletions batch_parse_table_v1beta2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_parse_table_beta]
import re

from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage


def batch_parse_table(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the tables of a document stored in Cloud Storage.

    Submits a single-document batch request with table extraction enabled,
    waits for the long-running operation to finish, then prints the names
    of the blobs found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to parse (sent as a PDF).
        destination_uri: ``gs://bucket/prefix`` location for the results.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Validate the destination before any work is done: the bucket and
    # prefix are needed to list the results afterwards, and failing here
    # avoids running the whole batch operation only to crash at the end.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            "destination_uri must have the form gs://bucket/prefix, "
            "got {!r}".format(destination_uri)
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1),
                ]
            ),
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params,
    )

    # The batch request carries a list of ProcessDocumentRequest objects;
    # this sample submits exactly one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print("Output files:")
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)


# [END documentai_batch_parse_table_beta]
46 changes: 46 additions & 0 deletions batch_parse_table_v1beta2_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import uuid

from google.cloud import storage

import pytest

from samples.snippets import batch_parse_table_v1beta2


# Project and input document come from the environment / a fixed sample URI.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"

# A uuid4 suffix gives each import of this module its own bucket name and
# output prefix; the output URI is assembled from the two.
BUCKET = "document-ai-{}".format(uuid.uuid4())
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)


@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a temporary output bucket for the duration of each test.

    The bucket named by ``BUCKET`` is created before the test body runs and
    force-deleted once the test has finished.
    """
    temp_bucket = storage.Client().create_bucket(BUCKET)

    # Hand control to the test; everything below runs as teardown.
    yield

    temp_bucket.delete(force=True)


def test_batch_parse_table(capsys):
    """Run the table sample and check that it prints its output header."""
    batch_parse_table_v1beta2.batch_parse_table(
        PROJECT_ID,
        INPUT_URI,
        BATCH_OUTPUT_URI,
    )

    captured = capsys.readouterr()
    assert "Output files:" in captured.out
Loading

0 comments on commit 7b2f8c9

Please sign in to comment.