From d923e5348983ebe0881b96dcfdb687696b5eb5df Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Wed, 14 Dec 2022 13:24:34 -0800 Subject: [PATCH] feat: added sharding_config field in DocumentOutputConfig.GcsOutputConfig in document_io.proto (#433) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: added sharding_config field in DocumentOutputConfig.GcsOutputConfig in document_io.proto feat: added process_options field in ProcessRequest in document_processor_service.proto feat: added sample_document_uris field in ProcessorType in processor_type.proto PiperOrigin-RevId: 495360288 Source-Link: https://github.com/googleapis/googleapis/commit/5f39f4653ed9b257db3f4e19bb3f9178f56b294b Source-Link: https://github.com/googleapis/googleapis-gen/commit/8520d57093d360afec194cd029b71324e597f626 Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiODUyMGQ1NzA5M2QzNjBhZmVjMTk0Y2QwMjliNzEzMjRlNTk3ZjYyNiJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: added sharding_config field in DocumentOutputConfig.GcsOutputConfig in document_io.proto feat: added process_options field in ProcessRequest in document_processor_service.proto feat: added sample_document_uris field in ProcessorType in processor_type.proto PiperOrigin-RevId: 495363748 Source-Link: https://github.com/googleapis/googleapis/commit/3dce9ff167afafeddda96de98192af131b313ffc Source-Link: https://github.com/googleapis/googleapis-gen/commit/66f3518217f0c04a47faefeb3322ff226106b857 Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiNjZmMzUxODIxN2YwYzA0YTQ3ZmFlZmViMzMyMmZmMjI2MTA2Yjg1NyJ9 * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md Co-authored-by: Owl Bot --- google/cloud/documentai_v1/types/document.py | 19 ++++++------ .../documentai_v1/types/document_schema.py | 2 +- .../documentai_v1/types/processor_type.py | 7 +++++ google/cloud/documentai_v1beta3/__init__.py | 4 +++ .../documentai_v1beta3/types/__init__.py | 4 +++ .../documentai_v1beta3/types/document.py | 19 ++++++------ .../documentai_v1beta3/types/document_io.py | 17 ++++++++++ .../types/document_processor_service.py | 31 +++++++++++++++++++ .../types/document_schema.py | 2 +- .../types/processor_type.py | 7 +++++ ...t_metadata_google.cloud.documentai.v1.json | 2 +- ...adata_google.cloud.documentai.v1beta2.json | 2 +- ...adata_google.cloud.documentai.v1beta3.json | 2 +- scripts/fixup_documentai_v1beta3_keywords.py | 4 +-- 14 files changed, 97 insertions(+), 25 deletions(-) diff --git a/google/cloud/documentai_v1/types/document.py b/google/cloud/documentai_v1/types/document.py index df7bf303..6e1090aa 100644 --- a/google/cloud/documentai_v1/types/document.py +++ b/google/cloud/documentai_v1/types/document.py @@ -72,7 +72,7 @@ class Document(proto.Message): Optional. UTF-8 encoded text in reading order from the document. text_styles (MutableSequence[google.cloud.documentai_v1.types.Document.Style]): - Placeholder. Styles for the + Styles for the [Document.text][google.cloud.documentai.v1.Document.text]. pages (MutableSequence[google.cloud.documentai_v1.types.Document.Page]): Visual page layout for the @@ -775,9 +775,9 @@ class FormField(proto.Message): If the value is non-textual, this field represents the type. Current valid values are: - - blank (this indicates the field_value is normal text) - - "unfilled_checkbox" - - "filled_checkbox". + - blank (this indicates the ``field_value`` is normal text) + - ``unfilled_checkbox`` + - ``filled_checkbox`` corrected_key_text (str): Created for Labeling UI to export key text. If corrections were made to the text identified by the @@ -1117,8 +1117,8 @@ class NormalizedValue(proto.Message): For some entity types, one of respective ``structured_value`` fields may also be populated. Also not all the types of ``structured_value`` will be normalized. - For example, some processors may not generate float or int - normalized text by default. + For example, some processors may not generate ``float`` or + ``integer`` normalized text by default. Below are sample formats mapped to structured values. @@ -1323,9 +1323,10 @@ class PageRef(proto.Message): page (int): Required. Index into the [Document.pages][google.cloud.documentai.v1.Document.pages] - element, for example using [Document.pages][page_refs.page] - to locate the related page element. This field is skipped - when its value is the default 0. See + element, for example using + ``[Document.pages][page_refs.page]`` to locate the related + page element. This field is skipped when its value is the + default ``0``. See https://developers.google.com/protocol-buffers/docs/proto3#json. layout_type (google.cloud.documentai_v1.types.Document.PageAnchor.PageRef.LayoutType): Optional. The type of the layout element that diff --git a/google/cloud/documentai_v1/types/document_schema.py b/google/cloud/documentai_v1/types/document_schema.py index c8716beb..f81539dc 100644 --- a/google/cloud/documentai_v1/types/document_schema.py +++ b/google/cloud/documentai_v1/types/document_schema.py @@ -66,7 +66,7 @@ class EntityType(proto.Message): following naming conventions: - *use ``snake_casing``* - - name matching is case-insensitive + - name matching is case-sensitive - Maximum 64 characters. - Must start with a letter. - Allowed characters: ASCII letters ``[a-z0-9_-]``. (For diff --git a/google/cloud/documentai_v1/types/processor_type.py b/google/cloud/documentai_v1/types/processor_type.py index 8a99ef5c..6eea5590 100644 --- a/google/cloud/documentai_v1/types/processor_type.py +++ b/google/cloud/documentai_v1/types/processor_type.py @@ -50,6 +50,9 @@ class ProcessorType(proto.Message): access. launch_stage (google.api.launch_stage_pb2.LaunchStage): Launch stage of the processor type + sample_document_uris (MutableSequence[str]): + A set of Cloud Storage URIs of sample + documents for this processor. """ class LocationInfo(proto.Message): @@ -92,6 +95,10 @@ class LocationInfo(proto.Message): number=8, enum=launch_stage_pb2.LaunchStage, ) + sample_document_uris: MutableSequence[str] = proto.RepeatedField( + proto.STRING, + number=9, + ) __all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/documentai_v1beta3/__init__.py b/google/cloud/documentai_v1beta3/__init__.py index 5065778e..ca5d1d24 100644 --- a/google/cloud/documentai_v1beta3/__init__.py +++ b/google/cloud/documentai_v1beta3/__init__.py @@ -30,6 +30,7 @@ GcsDocument, GcsDocuments, GcsPrefix, + OcrConfig, RawDocument, ) from .types.document_processor_service import ( @@ -67,6 +68,7 @@ ListProcessorTypesResponse, ListProcessorVersionsRequest, ListProcessorVersionsResponse, + ProcessOptions, ProcessRequest, ProcessResponse, ReviewDocumentOperationMetadata, @@ -138,6 +140,8 @@ "ListProcessorsRequest", "ListProcessorsResponse", "NormalizedVertex", + "OcrConfig", + "ProcessOptions", "ProcessRequest", "ProcessResponse", "Processor", diff --git a/google/cloud/documentai_v1beta3/types/__init__.py b/google/cloud/documentai_v1beta3/types/__init__.py index b8a711e7..21e68d43 100644 --- a/google/cloud/documentai_v1beta3/types/__init__.py +++ b/google/cloud/documentai_v1beta3/types/__init__.py @@ -21,6 +21,7 @@ GcsDocument, GcsDocuments, GcsPrefix, + OcrConfig, RawDocument, ) from .document_processor_service import ( @@ -58,6 +59,7 @@ ListProcessorTypesResponse, ListProcessorVersionsRequest, ListProcessorVersionsResponse, + ProcessOptions, ProcessRequest, ProcessResponse, ReviewDocumentOperationMetadata, @@ -88,6 +90,7 @@ "GcsDocument", "GcsDocuments", "GcsPrefix", + "OcrConfig", "RawDocument", "BatchProcessMetadata", "BatchProcessRequest", @@ -123,6 +126,7 @@ "ListProcessorTypesResponse", "ListProcessorVersionsRequest", "ListProcessorVersionsResponse", + "ProcessOptions", "ProcessRequest", "ProcessResponse", "ReviewDocumentOperationMetadata", diff --git a/google/cloud/documentai_v1beta3/types/document.py b/google/cloud/documentai_v1beta3/types/document.py index e7ec9877..7525315b 100644 --- a/google/cloud/documentai_v1beta3/types/document.py +++ b/google/cloud/documentai_v1beta3/types/document.py @@ -72,7 +72,7 @@ class Document(proto.Message): Optional. UTF-8 encoded text in reading order from the document. text_styles (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Style]): - Placeholder. Styles for the + Styles for the [Document.text][google.cloud.documentai.v1beta3.Document.text]. pages (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Page]): Visual page layout for the @@ -779,9 +779,9 @@ class FormField(proto.Message): If the value is non-textual, this field represents the type. Current valid values are: - - blank (this indicates the field_value is normal text) - - "unfilled_checkbox" - - "filled_checkbox". + - blank (this indicates the ``field_value`` is normal text) + - ``unfilled_checkbox`` + - ``filled_checkbox`` corrected_key_text (str): Created for Labeling UI to export key text. If corrections were made to the text identified by the @@ -1121,8 +1121,8 @@ class NormalizedValue(proto.Message): For some entity types, one of respective ``structured_value`` fields may also be populated. Also not all the types of ``structured_value`` will be normalized. - For example, some processors may not generate float or int - normalized text by default. + For example, some processors may not generate ``float`` or + ``integer`` normalized text by default. Below are sample formats mapped to structured values. @@ -1327,9 +1327,10 @@ class PageRef(proto.Message): page (int): Required. Index into the [Document.pages][google.cloud.documentai.v1beta3.Document.pages] - element, for example using [Document.pages][page_refs.page] - to locate the related page element. This field is skipped - when its value is the default 0. See + element, for example using + ``[Document.pages][page_refs.page]`` to locate the related + page element. This field is skipped when its value is the + default ``0``. See https://developers.google.com/protocol-buffers/docs/proto3#json. layout_type (google.cloud.documentai_v1beta3.types.Document.PageAnchor.PageRef.LayoutType): Optional. The type of the layout element that diff --git a/google/cloud/documentai_v1beta3/types/document_io.py b/google/cloud/documentai_v1beta3/types/document_io.py index a107e32d..7d0596c2 100644 --- a/google/cloud/documentai_v1beta3/types/document_io.py +++ b/google/cloud/documentai_v1beta3/types/document_io.py @@ -27,6 +27,7 @@ "GcsPrefix", "BatchDocumentsInputConfig", "DocumentOutputConfig", + "OcrConfig", }, ) @@ -218,4 +219,20 @@ class ShardingConfig(proto.Message): ) +class OcrConfig(proto.Message): + r"""Config for Document OCR. + + Attributes: + enable_native_pdf_parsing (bool): + Enables special handling for PDFs with + existing text information. Results in better + text extraction quality in such PDF inputs. + """ + + enable_native_pdf_parsing: bool = proto.Field( + proto.BOOL, + number=3, + ) + + __all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/google/cloud/documentai_v1beta3/types/document_processor_service.py b/google/cloud/documentai_v1beta3/types/document_processor_service.py index f7d74b57..32252d98 100644 --- a/google/cloud/documentai_v1beta3/types/document_processor_service.py +++ b/google/cloud/documentai_v1beta3/types/document_processor_service.py @@ -31,6 +31,7 @@ __protobuf__ = proto.module( package="google.cloud.documentai.v1beta3", manifest={ + "ProcessOptions", "ProcessRequest", "HumanReviewStatus", "ProcessResponse", @@ -83,6 +84,22 @@ ) +class ProcessOptions(proto.Message): + r"""Options for Process API + + Attributes: + ocr_config (google.cloud.documentai_v1beta3.types.OcrConfig): + Only applicable to "Document OCR Processor". + Returns error if set on other processor types. + """ + + ocr_config: document_io.OcrConfig = proto.Field( + proto.MESSAGE, + number=1, + message=document_io.OcrConfig, + ) + + class ProcessRequest(proto.Message): r"""Request message for the process document method. @@ -125,6 +142,8 @@ class ProcessRequest(proto.Message): document. Only supports top level document and pages field so it must be in the form of ``{document_field_name}`` or ``pages.{page_field_name}``. + process_options (google.cloud.documentai_v1beta3.types.ProcessOptions): + Inference-time options for the process API """ inline_document: gcd_document.Document = proto.Field( @@ -157,6 +176,11 @@ class ProcessRequest(proto.Message): number=6, message=field_mask_pb2.FieldMask, ) + process_options: "ProcessOptions" = proto.Field( + proto.MESSAGE, + number=7, + message="ProcessOptions", + ) class HumanReviewStatus(proto.Message): @@ -259,6 +283,8 @@ class BatchProcessRequest(proto.Message): skip_human_review (bool): Whether Human Review feature should be skipped for this request. Default to false. + process_options (google.cloud.documentai_v1beta3.types.ProcessOptions): + Inference-time options for the process API """ class BatchInputConfig(proto.Message): @@ -326,6 +352,11 @@ class BatchOutputConfig(proto.Message): proto.BOOL, number=4, ) + process_options: "ProcessOptions" = proto.Field( + proto.MESSAGE, + number=7, + message="ProcessOptions", + ) class BatchProcessResponse(proto.Message): diff --git a/google/cloud/documentai_v1beta3/types/document_schema.py b/google/cloud/documentai_v1beta3/types/document_schema.py index c2d47c68..0092fcba 100644 --- a/google/cloud/documentai_v1beta3/types/document_schema.py +++ b/google/cloud/documentai_v1beta3/types/document_schema.py @@ -66,7 +66,7 @@ class EntityType(proto.Message): following naming conventions: - *use ``snake_casing``* - - name matching is case-insensitive + - name matching is case-sensitive - Maximum 64 characters. - Must start with a letter. - Allowed characters: ASCII letters ``[a-z0-9_-]``. (For diff --git a/google/cloud/documentai_v1beta3/types/processor_type.py b/google/cloud/documentai_v1beta3/types/processor_type.py index 7f62cdb7..22c8e84e 100644 --- a/google/cloud/documentai_v1beta3/types/processor_type.py +++ b/google/cloud/documentai_v1beta3/types/processor_type.py @@ -50,6 +50,9 @@ class ProcessorType(proto.Message): access. launch_stage (google.api.launch_stage_pb2.LaunchStage): Launch stage of the processor type + sample_document_uris (MutableSequence[str]): + A set of Cloud Storage URIs of sample + documents for this processor. """ class LocationInfo(proto.Message): @@ -92,6 +95,10 @@ class LocationInfo(proto.Message): number=8, enum=launch_stage_pb2.LaunchStage, ) + sample_document_uris: MutableSequence[str] = proto.RepeatedField( + proto.STRING, + number=9, + ) __all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1.json b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1.json index 5b304c71..5064960d 100644 --- a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1.json +++ b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1.json @@ -8,7 +8,7 @@ ], "language": "PYTHON", "name": "google-cloud-documentai", - "version": "2.5.0" + "version": "0.1.0" }, "snippets": [ { diff --git a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta2.json b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta2.json index a97b55f3..ef56bd7e 100644 --- a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta2.json +++ b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta2.json @@ -8,7 +8,7 @@ ], "language": "PYTHON", "name": "google-cloud-documentai", - "version": "2.5.0" + "version": "0.1.0" }, "snippets": [ { diff --git a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta3.json b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta3.json index d9c8dc86..e88607d5 100644 --- a/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta3.json +++ b/samples/generated_samples/snippet_metadata_google.cloud.documentai.v1beta3.json @@ -8,7 +8,7 @@ ], "language": "PYTHON", "name": "google-cloud-documentai", - "version": "2.5.0" + "version": "0.1.0" }, "snippets": [ { diff --git a/scripts/fixup_documentai_v1beta3_keywords.py b/scripts/fixup_documentai_v1beta3_keywords.py index 31eecd92..0cab2650 100644 --- a/scripts/fixup_documentai_v1beta3_keywords.py +++ b/scripts/fixup_documentai_v1beta3_keywords.py @@ -39,7 +39,7 @@ def partition( class documentaiCallTransformer(cst.CSTTransformer): CTRL_PARAMS: Tuple[str] = ('retry', 'timeout', 'metadata') METHOD_TO_PARAMS: Dict[str, Tuple[str]] = { - 'batch_process_documents': ('name', 'input_configs', 'output_config', 'input_documents', 'document_output_config', 'skip_human_review', ), + 'batch_process_documents': ('name', 'input_configs', 'output_config', 'input_documents', 'document_output_config', 'skip_human_review', 'process_options', ), 'create_processor': ('parent', 'processor', ), 'delete_processor': ('name', ), 'delete_processor_version': ('name', ), @@ -55,7 +55,7 @@ class documentaiCallTransformer(cst.CSTTransformer): 'list_processors': ('parent', 'page_size', 'page_token', ), 'list_processor_types': ('parent', 'page_size', 'page_token', ), 'list_processor_versions': ('parent', 'page_size', 'page_token', ), - 'process_document': ('name', 'inline_document', 'raw_document', 'document', 'skip_human_review', 'field_mask', ), + 'process_document': ('name', 'inline_document', 'raw_document', 'document', 'skip_human_review', 'field_mask', 'process_options', ), 'review_document': ('human_review_config', 'inline_document', 'document', 'enable_schema_validation', 'priority', 'document_schema', ), 'set_default_processor_version': ('processor', 'default_processor_version', ), 'train_processor_version': ('parent', 'processor_version', 'document_schema', 'input_data', 'base_processor_version', ),