From f008893d15b88ccd29c23d3c49fee6c1a5c1e1d9 Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 20:45:46 -0400 Subject: [PATCH] feat: [google-cloud-documentai] Support a new Layout Processor in Document AI (#12541) BEGIN_COMMIT_OVERRIDE feat: Support a new Layout Processor in Document AI docs: keep the API doc up-to-date with recent changes END_COMMIT_OVERRIDE - [ ] Regenerate this pull request now. docs: keep the API doc up-to-date with recent changes PiperOrigin-RevId: 621233157 Source-Link: https://github.com/googleapis/googleapis/commit/d5020fff4cbe108bdf506074791c56cff7840bef Source-Link: https://github.com/googleapis/googleapis-gen/commit/3beacfd02f8cf650bbae6ef8c37131c98723fa17 Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLWRvY3VtZW50YWkvLk93bEJvdC55YW1sIiwiaCI6IjNiZWFjZmQwMmY4Y2Y2NTBiYmFlNmVmOGMzNzEzMWM5ODcyM2ZhMTcifQ== --------- Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou Co-authored-by: ohmayr --- .../documentai_v1beta3/types/document.py | 325 ++++++++++++++++++ .../types/document_processor_service.py | 60 ++++ .../types/document_service.py | 4 +- .../documentai_v1beta3/types/processor.py | 6 +- 4 files changed, 390 insertions(+), 5 deletions(-) diff --git a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document.py b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document.py index 4aa3887f31df..13ba67cff9de 100644 --- a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document.py +++ b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document.py @@ -104,6 +104,10 @@ class Document(proto.Message): revisions (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Revision]): Placeholder. Revision history of this document. + document_layout (google.cloud.documentai_v1beta3.types.Document.DocumentLayout): + Parsed layout of the document. + chunked_document (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument): + Document chunked based on chunking config. """ class ShardInfo(proto.Message): @@ -1811,6 +1815,317 @@ class TextChange(proto.Message): message="Document.Provenance", ) + class DocumentLayout(proto.Message): + r"""Represents the parsed layout of a document as a collection of + blocks that the document is divided into. + + Attributes: + blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]): + List of blocks in the document. + """ + + class DocumentLayoutBlock(proto.Message): + r"""Represents a block. A block could be one of the various types + (text, table, list) supported. + + This message has `oneof`_ fields (mutually exclusive fields). + For each oneof, at most one member field can be set at the same time. + Setting any member of the oneof automatically clears all other + members. + + .. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields + + Attributes: + text_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock): + Block consisting of text content. + + This field is a member of `oneof`_ ``block``. + table_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock): + Block consisting of table content/structure. + + This field is a member of `oneof`_ ``block``. + list_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock): + Block consisting of list content/structure. + + This field is a member of `oneof`_ ``block``. + block_id (str): + ID of the block. + page_span (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan): + Page span of the block. + """ + + class LayoutPageSpan(proto.Message): + r"""Represents where the block starts and ends in the document. + + Attributes: + page_start (int): + Page where block starts in the document. + page_end (int): + Page where block ends in the document. + """ + + page_start: int = proto.Field( + proto.INT32, + number=1, + ) + page_end: int = proto.Field( + proto.INT32, + number=2, + ) + + class LayoutTextBlock(proto.Message): + r"""Represents a text type block. + + Attributes: + text (str): + Text content stored in the block. + type_ (str): + Type of the text in the block. Available options are: + ``paragraph``, ``subtitle``, ``heading-1``, ``heading-2``, + ``heading-3``, ``heading-4``, ``heading-5``, ``header``, + ``footer``. + blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]): + A text block could further have child blocks. + Repeated blocks support further hierarchies and + nested blocks. + """ + + text: str = proto.Field( + proto.STRING, + number=1, + ) + type_: str = proto.Field( + proto.STRING, + number=2, + ) + blocks: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock" + ] = proto.RepeatedField( + proto.MESSAGE, + number=3, + message="Document.DocumentLayout.DocumentLayoutBlock", + ) + + class LayoutTableBlock(proto.Message): + r"""Represents a table type block. + + Attributes: + header_rows (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow]): + Header rows at the top of the table. + body_rows (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow]): + Body rows containing main table content. + caption (str): + Table caption/title. + """ + + header_rows: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow", + ) + body_rows: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow" + ] = proto.RepeatedField( + proto.MESSAGE, + number=2, + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow", + ) + caption: str = proto.Field( + proto.STRING, + number=3, + ) + + class LayoutTableRow(proto.Message): + r"""Represents a row in a table. + + Attributes: + cells (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell]): + A table row is a list of table cells. + """ + + cells: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell", + ) + + class LayoutTableCell(proto.Message): + r"""Represents a cell in a table row. + + Attributes: + blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]): + A table cell is a list of blocks. + Repeated blocks support further hierarchies and + nested blocks. + row_span (int): + How many rows this cell spans. + col_span (int): + How many columns this cell spans. + """ + + blocks: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock", + ) + row_span: int = proto.Field( + proto.INT32, + number=2, + ) + col_span: int = proto.Field( + proto.INT32, + number=3, + ) + + class LayoutListBlock(proto.Message): + r"""Represents a list type block. + + Attributes: + list_entries (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry]): + List entries that constitute a list block. + type_ (str): + Type of the list_entries (if exist). Available options are + ``ordered`` and ``unordered``. + """ + + list_entries: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry", + ) + type_: str = proto.Field( + proto.STRING, + number=2, + ) + + class LayoutListEntry(proto.Message): + r"""Represents an entry in the list. + + Attributes: + blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]): + A list entry is a list of blocks. + Repeated blocks support further hierarchies and + nested blocks. + """ + + blocks: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock", + ) + + text_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock" = proto.Field( + proto.MESSAGE, + number=2, + oneof="block", + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock", + ) + table_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock" = proto.Field( + proto.MESSAGE, + number=3, + oneof="block", + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock", + ) + list_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock" = proto.Field( + proto.MESSAGE, + number=4, + oneof="block", + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock", + ) + block_id: str = proto.Field( + proto.STRING, + number=1, + ) + page_span: "Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan" = proto.Field( + proto.MESSAGE, + number=5, + message="Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan", + ) + + blocks: MutableSequence[ + "Document.DocumentLayout.DocumentLayoutBlock" + ] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.DocumentLayout.DocumentLayoutBlock", + ) + + class ChunkedDocument(proto.Message): + r"""Represents the chunks that the document is divided into. + + Attributes: + chunks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk]): + List of chunks. + """ + + class Chunk(proto.Message): + r"""Represents a chunk. + + Attributes: + chunk_id (str): + ID of the chunk. + source_block_ids (MutableSequence[str]): + List of all parsed documents layout source + blocks used to generate the chunk. + content (str): + Text content of the chunk. + page_span (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.ChunkPageSpan): + Page span of the chunk. + """ + + class ChunkPageSpan(proto.Message): + r"""Represents where the chunk starts and ends in the document. + + Attributes: + page_start (int): + Page where chunk starts in the document. + page_end (int): + Page where chunk ends in the document. + """ + + page_start: int = proto.Field( + proto.INT32, + number=1, + ) + page_end: int = proto.Field( + proto.INT32, + number=2, + ) + + chunk_id: str = proto.Field( + proto.STRING, + number=1, + ) + source_block_ids: MutableSequence[str] = proto.RepeatedField( + proto.STRING, + number=2, + ) + content: str = proto.Field( + proto.STRING, + number=3, + ) + page_span: "Document.ChunkedDocument.Chunk.ChunkPageSpan" = proto.Field( + proto.MESSAGE, + number=4, + message="Document.ChunkedDocument.Chunk.ChunkPageSpan", + ) + + chunks: MutableSequence["Document.ChunkedDocument.Chunk"] = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Document.ChunkedDocument.Chunk", + ) + uri: str = proto.Field( proto.STRING, number=1, @@ -1869,6 +2184,16 @@ class TextChange(proto.Message): number=13, message=Revision, ) + document_layout: DocumentLayout = proto.Field( + proto.MESSAGE, + number=17, + message=DocumentLayout, + ) + chunked_document: ChunkedDocument = proto.Field( + proto.MESSAGE, + number=18, + message=ChunkedDocument, + ) class RevisionRef(proto.Message): diff --git a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_processor_service.py b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_processor_service.py index fcdd8fb641f0..d46fe1cca39e 100644 --- a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_processor_service.py +++ b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_processor_service.py @@ -119,6 +119,9 @@ class ProcessOptions(proto.Message): Only applicable to ``OCR_PROCESSOR`` and ``FORM_PARSER_PROCESSOR``. Returns error if set on other processor types. + layout_config (google.cloud.documentai_v1beta3.types.ProcessOptions.LayoutConfig): + Optional. Only applicable to ``LAYOUT_PARSER_PROCESSOR``. + Returns error if set on other processor types. schema_override (google.cloud.documentai_v1beta3.types.DocumentSchema): Optional. Override the schema of the [ProcessorVersion][google.cloud.documentai.v1beta3.ProcessorVersion]. @@ -128,6 +131,58 @@ class ProcessOptions(proto.Message): doesn't support schema override. """ + class LayoutConfig(proto.Message): + r"""Serving config for layout parser processor. + + Attributes: + chunking_config (google.cloud.documentai_v1beta3.types.ProcessOptions.LayoutConfig.ChunkingConfig): + Optional. Config for chunking in layout + parser processor. + """ + + class ChunkingConfig(proto.Message): + r"""Serving config for chunking. + + Attributes: + chunk_size (int): + Optional. The chunk sizes to use when + splitting documents, in order of level. + include_ancestor_headings (bool): + Optional. Whether or not to include ancestor + headings when splitting. + semantic_chunking_group_size (bool): + Optional. The number of tokens to group + together when evaluating semantic similarity. + breakpoint_percentile_threshold (int): + Optional. The percentile of cosine + dissimilarity that must be exceeded between a + group of tokens and the next. The smaller this + number is, the more chunks will be generated. + """ + + chunk_size: int = proto.Field( + proto.INT32, + number=1, + ) + include_ancestor_headings: bool = proto.Field( + proto.BOOL, + number=2, + ) + semantic_chunking_group_size: bool = proto.Field( + proto.BOOL, + number=3, + ) + breakpoint_percentile_threshold: int = proto.Field( + proto.INT32, + number=4, + ) + + chunking_config: "ProcessOptions.LayoutConfig.ChunkingConfig" = proto.Field( + proto.MESSAGE, + number=1, + message="ProcessOptions.LayoutConfig.ChunkingConfig", + ) + class IndividualPageSelector(proto.Message): r"""A list of individual page numbers. @@ -163,6 +218,11 @@ class IndividualPageSelector(proto.Message): number=1, message=document_io.OcrConfig, ) + layout_config: LayoutConfig = proto.Field( + proto.MESSAGE, + number=9, + message=LayoutConfig, + ) schema_override: gcd_document_schema.DocumentSchema = proto.Field( proto.MESSAGE, number=8, diff --git a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_service.py b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_service.py index 5fd1139ddcfb..86af095d60ab 100644 --- a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_service.py +++ b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document_service.py @@ -396,10 +396,8 @@ class ListDocumentsRequest(proto.Message): https://google.aip.dev/160. Currently support query strings are: - ------------------------------------ - - ``SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`` + - ``SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`` - ``LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`` - ``DisplayName=\"file_name.pdf\"`` - ``EntityType=abc/def`` diff --git a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/processor.py b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/processor.py index 7094f0b28ccd..6801da0b543e 100644 --- a/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/processor.py +++ b/packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/processor.py @@ -42,7 +42,8 @@ class ProcessorVersion(proto.Message): Attributes: name (str): - The resource name of the processor version. Format: + Identifier. The resource name of the processor version. + Format: ``projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processor_version}`` display_name (str): The display name of the processor version. @@ -50,7 +51,8 @@ class ProcessorVersion(proto.Message): The schema of the processor version. Describes the output. state (google.cloud.documentai_v1beta3.types.ProcessorVersion.State): - The state of the processor version. + Output only. The state of the processor + version. create_time (google.protobuf.timestamp_pb2.Timestamp): The time the processor version was created. latest_evaluation (google.cloud.documentai_v1beta3.types.EvaluationReference):