Skip to content
This repository has been archived by the owner on Sep 20, 2023. It is now read-only.

feat: added sharding_config field in DocumentOutputConfig.GcsOutputConfig in document_io.proto #433

Merged
merged 4 commits into from
Dec 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions google/cloud/documentai_v1/types/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class Document(proto.Message):
Optional. UTF-8 encoded text in reading order
from the document.
text_styles (MutableSequence[google.cloud.documentai_v1.types.Document.Style]):
Placeholder. Styles for the
Styles for the
[Document.text][google.cloud.documentai.v1.Document.text].
pages (MutableSequence[google.cloud.documentai_v1.types.Document.Page]):
Visual page layout for the
Expand Down Expand Up @@ -775,9 +775,9 @@ class FormField(proto.Message):
If the value is non-textual, this field represents the type.
Current valid values are:

- blank (this indicates the field_value is normal text)
- "unfilled_checkbox"
- "filled_checkbox".
- blank (this indicates the ``field_value`` is normal text)
- ``unfilled_checkbox``
- ``filled_checkbox``
corrected_key_text (str):
Created for Labeling UI to export key text. If corrections
were made to the text identified by the
Expand Down Expand Up @@ -1117,8 +1117,8 @@ class NormalizedValue(proto.Message):
For some entity types, one of respective
``structured_value`` fields may also be populated. Also not
all the types of ``structured_value`` will be normalized.
For example, some processors may not generate float or int
normalized text by default.
For example, some processors may not generate ``float`` or
``integer`` normalized text by default.

Below are sample formats mapped to structured values.

Expand Down Expand Up @@ -1323,9 +1323,10 @@ class PageRef(proto.Message):
page (int):
Required. Index into the
[Document.pages][google.cloud.documentai.v1.Document.pages]
element, for example using [Document.pages][page_refs.page]
to locate the related page element. This field is skipped
when its value is the default 0. See
element, for example using
``[Document.pages][page_refs.page]`` to locate the related
page element. This field is skipped when its value is the
default ``0``. See
https://developers.google.com/protocol-buffers/docs/proto3#json.
layout_type (google.cloud.documentai_v1.types.Document.PageAnchor.PageRef.LayoutType):
Optional. The type of the layout element that
Expand Down
2 changes: 1 addition & 1 deletion google/cloud/documentai_v1/types/document_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class EntityType(proto.Message):
following naming conventions:

- *use ``snake_casing``*
- name matching is case-insensitive
- name matching is case-sensitive
- Maximum 64 characters.
- Must start with a letter.
- Allowed characters: ASCII letters ``[a-z0-9_-]``. (For
Expand Down
7 changes: 7 additions & 0 deletions google/cloud/documentai_v1/types/processor_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ class ProcessorType(proto.Message):
access.
launch_stage (google.api.launch_stage_pb2.LaunchStage):
Launch stage of the processor type
sample_document_uris (MutableSequence[str]):
A set of Cloud Storage URIs of sample
documents for this processor.
"""

class LocationInfo(proto.Message):
Expand Down Expand Up @@ -92,6 +95,10 @@ class LocationInfo(proto.Message):
number=8,
enum=launch_stage_pb2.LaunchStage,
)
sample_document_uris: MutableSequence[str] = proto.RepeatedField(
proto.STRING,
number=9,
)


__all__ = tuple(sorted(__protobuf__.manifest))
4 changes: 4 additions & 0 deletions google/cloud/documentai_v1beta3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
GcsDocument,
GcsDocuments,
GcsPrefix,
OcrConfig,
RawDocument,
)
from .types.document_processor_service import (
Expand Down Expand Up @@ -67,6 +68,7 @@
ListProcessorTypesResponse,
ListProcessorVersionsRequest,
ListProcessorVersionsResponse,
ProcessOptions,
ProcessRequest,
ProcessResponse,
ReviewDocumentOperationMetadata,
Expand Down Expand Up @@ -138,6 +140,8 @@
"ListProcessorsRequest",
"ListProcessorsResponse",
"NormalizedVertex",
"OcrConfig",
"ProcessOptions",
"ProcessRequest",
"ProcessResponse",
"Processor",
Expand Down
4 changes: 4 additions & 0 deletions google/cloud/documentai_v1beta3/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
GcsDocument,
GcsDocuments,
GcsPrefix,
OcrConfig,
RawDocument,
)
from .document_processor_service import (
Expand Down Expand Up @@ -58,6 +59,7 @@
ListProcessorTypesResponse,
ListProcessorVersionsRequest,
ListProcessorVersionsResponse,
ProcessOptions,
ProcessRequest,
ProcessResponse,
ReviewDocumentOperationMetadata,
Expand Down Expand Up @@ -88,6 +90,7 @@
"GcsDocument",
"GcsDocuments",
"GcsPrefix",
"OcrConfig",
"RawDocument",
"BatchProcessMetadata",
"BatchProcessRequest",
Expand Down Expand Up @@ -123,6 +126,7 @@
"ListProcessorTypesResponse",
"ListProcessorVersionsRequest",
"ListProcessorVersionsResponse",
"ProcessOptions",
"ProcessRequest",
"ProcessResponse",
"ReviewDocumentOperationMetadata",
Expand Down
19 changes: 10 additions & 9 deletions google/cloud/documentai_v1beta3/types/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class Document(proto.Message):
Optional. UTF-8 encoded text in reading order
from the document.
text_styles (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Style]):
Placeholder. Styles for the
Styles for the
[Document.text][google.cloud.documentai.v1beta3.Document.text].
pages (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Page]):
Visual page layout for the
Expand Down Expand Up @@ -779,9 +779,9 @@ class FormField(proto.Message):
If the value is non-textual, this field represents the type.
Current valid values are:

- blank (this indicates the field_value is normal text)
- "unfilled_checkbox"
- "filled_checkbox".
- blank (this indicates the ``field_value`` is normal text)
- ``unfilled_checkbox``
- ``filled_checkbox``
corrected_key_text (str):
Created for Labeling UI to export key text. If corrections
were made to the text identified by the
Expand Down Expand Up @@ -1121,8 +1121,8 @@ class NormalizedValue(proto.Message):
For some entity types, one of respective
``structured_value`` fields may also be populated. Also not
all the types of ``structured_value`` will be normalized.
For example, some processors may not generate float or int
normalized text by default.
For example, some processors may not generate ``float`` or
``integer`` normalized text by default.

Below are sample formats mapped to structured values.

Expand Down Expand Up @@ -1327,9 +1327,10 @@ class PageRef(proto.Message):
page (int):
Required. Index into the
[Document.pages][google.cloud.documentai.v1beta3.Document.pages]
element, for example using [Document.pages][page_refs.page]
to locate the related page element. This field is skipped
when its value is the default 0. See
element, for example using
``[Document.pages][page_refs.page]`` to locate the related
page element. This field is skipped when its value is the
default ``0``. See
https://developers.google.com/protocol-buffers/docs/proto3#json.
layout_type (google.cloud.documentai_v1beta3.types.Document.PageAnchor.PageRef.LayoutType):
Optional. The type of the layout element that
Expand Down
17 changes: 17 additions & 0 deletions google/cloud/documentai_v1beta3/types/document_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"GcsPrefix",
"BatchDocumentsInputConfig",
"DocumentOutputConfig",
"OcrConfig",
},
)

Expand Down Expand Up @@ -218,4 +219,20 @@ class ShardingConfig(proto.Message):
)


class OcrConfig(proto.Message):
    r"""Configuration options for Document OCR.

    Attributes:
        enable_native_pdf_parsing (bool):
            Turns on special handling for PDFs that
            already carry text information, which gives
            better text extraction quality for such
            PDF inputs.
    """

    # Boolean proto field, declared with field number 3.
    enable_native_pdf_parsing: bool = proto.Field(proto.BOOL, number=3)


__all__ = tuple(sorted(__protobuf__.manifest))
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
__protobuf__ = proto.module(
package="google.cloud.documentai.v1beta3",
manifest={
"ProcessOptions",
"ProcessRequest",
"HumanReviewStatus",
"ProcessResponse",
Expand Down Expand Up @@ -83,6 +84,22 @@
)


class ProcessOptions(proto.Message):
    r"""Options for the Process API.

    Attributes:
        ocr_config (google.cloud.documentai_v1beta3.types.OcrConfig):
            Applies only to the "Document OCR Processor";
            an error is returned if it is set on any
            other processor type.
    """

    # Message-typed proto field (field number 1) carrying an OcrConfig.
    ocr_config: document_io.OcrConfig = proto.Field(
        proto.MESSAGE, number=1, message=document_io.OcrConfig
    )


class ProcessRequest(proto.Message):
r"""Request message for the process document method.

Expand Down Expand Up @@ -125,6 +142,8 @@ class ProcessRequest(proto.Message):
document. Only supports top level document and pages field
so it must be in the form of ``{document_field_name}`` or
``pages.{page_field_name}``.
process_options (google.cloud.documentai_v1beta3.types.ProcessOptions):
Inference-time options for the process API
"""

inline_document: gcd_document.Document = proto.Field(
Expand Down Expand Up @@ -157,6 +176,11 @@ class ProcessRequest(proto.Message):
number=6,
message=field_mask_pb2.FieldMask,
)
process_options: "ProcessOptions" = proto.Field(
proto.MESSAGE,
number=7,
message="ProcessOptions",
)


class HumanReviewStatus(proto.Message):
Expand Down Expand Up @@ -259,6 +283,8 @@ class BatchProcessRequest(proto.Message):
skip_human_review (bool):
Whether Human Review feature should be
skipped for this request. Default to false.
process_options (google.cloud.documentai_v1beta3.types.ProcessOptions):
Inference-time options for the process API
"""

class BatchInputConfig(proto.Message):
Expand Down Expand Up @@ -326,6 +352,11 @@ class BatchOutputConfig(proto.Message):
proto.BOOL,
number=4,
)
process_options: "ProcessOptions" = proto.Field(
proto.MESSAGE,
number=7,
message="ProcessOptions",
)


class BatchProcessResponse(proto.Message):
Expand Down
2 changes: 1 addition & 1 deletion google/cloud/documentai_v1beta3/types/document_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class EntityType(proto.Message):
following naming conventions:

- *use ``snake_casing``*
- name matching is case-insensitive
- name matching is case-sensitive
- Maximum 64 characters.
- Must start with a letter.
- Allowed characters: ASCII letters ``[a-z0-9_-]``. (For
Expand Down
7 changes: 7 additions & 0 deletions google/cloud/documentai_v1beta3/types/processor_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ class ProcessorType(proto.Message):
access.
launch_stage (google.api.launch_stage_pb2.LaunchStage):
Launch stage of the processor type
sample_document_uris (MutableSequence[str]):
A set of Cloud Storage URIs of sample
documents for this processor.
"""

class LocationInfo(proto.Message):
Expand Down Expand Up @@ -92,6 +95,10 @@ class LocationInfo(proto.Message):
number=8,
enum=launch_stage_pb2.LaunchStage,
)
sample_document_uris: MutableSequence[str] = proto.RepeatedField(
proto.STRING,
number=9,
)


__all__ = tuple(sorted(__protobuf__.manifest))
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-documentai",
"version": "2.5.0"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-documentai",
"version": "2.5.0"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
],
"language": "PYTHON",
"name": "google-cloud-documentai",
"version": "2.5.0"
"version": "0.1.0"
},
"snippets": [
{
Expand Down
4 changes: 2 additions & 2 deletions scripts/fixup_documentai_v1beta3_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def partition(
class documentaiCallTransformer(cst.CSTTransformer):
CTRL_PARAMS: Tuple[str] = ('retry', 'timeout', 'metadata')
METHOD_TO_PARAMS: Dict[str, Tuple[str]] = {
'batch_process_documents': ('name', 'input_configs', 'output_config', 'input_documents', 'document_output_config', 'skip_human_review', ),
'batch_process_documents': ('name', 'input_configs', 'output_config', 'input_documents', 'document_output_config', 'skip_human_review', 'process_options', ),
'create_processor': ('parent', 'processor', ),
'delete_processor': ('name', ),
'delete_processor_version': ('name', ),
Expand All @@ -55,7 +55,7 @@ class documentaiCallTransformer(cst.CSTTransformer):
'list_processors': ('parent', 'page_size', 'page_token', ),
'list_processor_types': ('parent', 'page_size', 'page_token', ),
'list_processor_versions': ('parent', 'page_size', 'page_token', ),
'process_document': ('name', 'inline_document', 'raw_document', 'document', 'skip_human_review', 'field_mask', ),
'process_document': ('name', 'inline_document', 'raw_document', 'document', 'skip_human_review', 'field_mask', 'process_options', ),
'review_document': ('human_review_config', 'inline_document', 'document', 'enable_schema_validation', 'priority', 'document_schema', ),
'set_default_processor_version': ('processor', 'default_processor_version', ),
'train_processor_version': ('parent', 'processor_version', 'document_schema', 'input_data', 'base_processor_version', ),
Expand Down