From a83648767b9e07ce7708cb91705b7bc85ddb9429 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Wed, 9 Nov 2022 13:26:00 -0600 Subject: [PATCH] docs(samples): Updated code samples for 2.1.0 release - Added Image Quality Output to Document OCR Processor - Added `field_mask` to `batch_process` samples --- ...cess_documents_processor_version_sample.py | 4 +- ...documents_processor_version_sample_test.py | 2 + .../batch_process_documents_sample.py | 4 +- .../batch_process_documents_sample_test.py | 2 + .../snippets/process_document_ocr_sample.py | 39 ++++++++++++++++--- .../process_document_ocr_sample_test.py | 2 + 6 files changed, 45 insertions(+), 8 deletions(-) diff --git a/samples/snippets/batch_process_documents_processor_version_sample.py b/samples/snippets/batch_process_documents_processor_version_sample.py index c1fab35a..1ef32afe 100644 --- a/samples/snippets/batch_process_documents_processor_version_sample.py +++ b/samples/snippets/batch_process_documents_processor_version_sample.py @@ -29,6 +29,7 @@ # input_mime_type = "application/pdf" # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/ +# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object. def batch_process_documents_processor_version( @@ -40,6 +41,7 @@ def batch_process_documents_processor_version( input_mime_type: str, gcs_output_bucket: str, gcs_output_uri_prefix: str, + field_mask: str = None, timeout: int = 400, ): @@ -67,7 +69,7 @@ def batch_process_documents_processor_version( destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/" gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig( - gcs_uri=destination_uri + gcs_uri=destination_uri, field_mask=field_mask ) # Where to write results diff --git a/samples/snippets/batch_process_documents_processor_version_sample_test.py b/samples/snippets/batch_process_documents_processor_version_sample_test.py index b39c8ab2..9eb33a0c 100644 --- a/samples/snippets/batch_process_documents_processor_version_sample_test.py +++ b/samples/snippets/batch_process_documents_processor_version_sample_test.py @@ -28,6 +28,7 @@ gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf" input_mime_type = "application/pdf" gcs_output_uri_prefix = uuid4() +field_mask = "text,pages.pageNumber" BUCKET_NAME = f"document-ai-python-{uuid4()}" @@ -56,6 +57,7 @@ def test_batch_process_documents_processor_version(capsys, test_bucket): input_mime_type=input_mime_type, gcs_output_bucket=f"gs://{test_bucket}", gcs_output_uri_prefix=gcs_output_uri_prefix, + field_mask=field_mask, ) out, _ = capsys.readouterr() diff --git a/samples/snippets/batch_process_documents_sample.py b/samples/snippets/batch_process_documents_sample.py index 3b2e07ea..4c9c97a7 100644 --- a/samples/snippets/batch_process_documents_sample.py +++ b/samples/snippets/batch_process_documents_sample.py @@ -28,6 +28,7 @@ # input_mime_type = "application/pdf" # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/ +# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object. def batch_process_documents( @@ -38,6 +39,7 @@ def batch_process_documents( input_mime_type: str, gcs_output_bucket: str, gcs_output_uri_prefix: str, + field_mask: str = None, timeout: int = 400, ): @@ -65,7 +67,7 @@ def batch_process_documents( destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/" gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig( - gcs_uri=destination_uri + gcs_uri=destination_uri, field_mask=field_mask ) # Where to write results diff --git a/samples/snippets/batch_process_documents_sample_test.py b/samples/snippets/batch_process_documents_sample_test.py index 76a224b0..5cca811b 100644 --- a/samples/snippets/batch_process_documents_sample_test.py +++ b/samples/snippets/batch_process_documents_sample_test.py @@ -27,6 +27,7 @@ gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf" input_mime_type = "application/pdf" gcs_output_uri_prefix = uuid4() +field_mask = "text,pages.pageNumber" BUCKET_NAME = f"document-ai-python-{uuid4()}" @@ -54,6 +55,7 @@ def test_batch_process_documents(capsys, test_bucket): input_mime_type=input_mime_type, gcs_output_bucket=f"gs://{test_bucket}", gcs_output_uri_prefix=gcs_output_uri_prefix, + field_mask=field_mask, ) out, _ = capsys.readouterr() diff --git a/samples/snippets/process_document_ocr_sample.py b/samples/snippets/process_document_ocr_sample.py index a48d2178..569c33b0 100644 --- a/samples/snippets/process_document_ocr_sample.py +++ b/samples/snippets/process_document_ocr_sample.py @@ -24,16 +24,22 @@ # project_id = 'YOUR_PROJECT_ID' # location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu' # processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample +# processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information # file_path = '/path/to/local/pdf' # mime_type = 'application/pdf' # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types def process_document_ocr_sample( - project_id: str, location: str, processor_id: str, file_path: str, mime_type: str + project_id: str, + location: str, + processor_id: str, + processor_version: str, + file_path: str, + mime_type: str, ) -> None: # Online processing request to Document AI document = process_document( - project_id, location, processor_id, file_path, mime_type + project_id, location, processor_id, processor_version, file_path, mime_type ) # For a full list of Document object attributes, please reference this page: @@ -52,19 +58,30 @@ def process_document_ocr_sample( print_lines(page.lines, text) print_tokens(page.tokens, text) + # Currently supported in version pretrained-ocr-v1.1-2022-09-12 + if page.image_quality_scores: + print_image_quality_scores(page.image_quality_scores) + def process_document( - project_id: str, location: str, processor_id: str, file_path: str, mime_type: str + project_id: str, + location: str, + processor_id: str, + processor_version: str, + file_path: str, + mime_type: str, ) -> documentai.Document: # You must set the api_endpoint if you use a location other than 'us', e.g.: opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com") client = documentai.DocumentProcessorServiceClient(client_options=opts) - # The full resource name of the processor, e.g.: - # projects/project_id/locations/location/processor/processor_id + # The full resource name of the processor version + # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id} # You must create processors before running sample code. - name = client.processor_path(project_id, location, processor_id) + name = client.processor_version_path( + project_id, location, processor_id, processor_version + ) # Read the file into memory with open(file_path, "rb") as image: @@ -133,6 +150,16 @@ def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> print(f" Last token break type: {repr(last_token_break_type)}") +def print_image_quality_scores( + image_quality_scores: documentai.Document.Page.ImageQualityScores, +) -> None: + print(f" Quality score: {image_quality_scores.quality_score:.1%}") + print(" Detected defects:") + + for detected_defect in image_quality_scores.detected_defects: + print(f" {detected_defect.type_}: {detected_defect.confidence:.1%}") + + def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str: """ Document AI identifies text in different parts of the document by their diff --git a/samples/snippets/process_document_ocr_sample_test.py b/samples/snippets/process_document_ocr_sample_test.py index d6cceb46..49228fbd 100644 --- a/samples/snippets/process_document_ocr_sample_test.py +++ b/samples/snippets/process_document_ocr_sample_test.py @@ -20,6 +20,7 @@ location = "us" project_id = os.environ["GOOGLE_CLOUD_PROJECT"] processor_id = "52a38e080c1a7296" +processor_version = "rc" file_path = "resources/handwritten_form.pdf" mime_type = "application/pdf" @@ -29,6 +30,7 @@ def test_process_documents(capsys): project_id=project_id, location=location, processor_id=processor_id, + processor_version=processor_version, file_path=file_path, mime_type=mime_type, )