diff --git a/CHANGELOG.md b/CHANGELOG.md index bef231d402..cd879e566a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev12 +## 0.13.8-dev13 ### Enhancements @@ -7,6 +7,7 @@ * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. ### Features +* **Add form extraction basics (document elements and placeholder code in partition).** This lays the groundwork for future form extraction support. Form extraction models are not yet available in the library, so attempting to use this functionality raises a `NotImplementedError`. ### Fixes diff --git a/example-docs/test_evaluate_files/unstructured_output/form.json b/example-docs/test_evaluate_files/unstructured_output/form.json new file mode 100644 index 0000000000..468fc4468e --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output/form.json @@ -0,0 +1,149 @@ +[ + { + "type": "FormKeysValues", + "element_id": "MOCK_FORM_ID", + "text": "", + "metadata": { + "coordinates": { + "points": [ + [ + 35.15625, + 95.556640625 + ], + [ + 710.357666015625, + 95.556640625 + ], + [ + 710.357666015625, + 887.890625 + ], + [ + 35.15625, + 887.890625 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1, + "key_value_pairs": [ + { + "key": { + "text": "MOCK KEY", + "custom_element": { + "type": "UncategorizedText", + "element_id": "MOCK_KEY_ID_1", + "text": "MOCK KEY", + "metadata": { + "coordinates": { + "points": [ + [ + 503.271484375, + 96.3897705078125 + ], + [ + 503.271484375, + 107.5164794921875 + ], + [ + 606.103515625, + 107.5164794921875 + ], + [ + 606.103515625, + 96.3897705078125 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "value": { + "text": "MOCK VALUE", + "custom_element": { + "type": 
"UncategorizedText", + "element_id": "MOCK_VALUE_ID", + "text": "MOCK VALUE", + "metadata": { + "coordinates": { + "points": [ + [ + 557.568359375, + 124.8626708984375 + ], + [ + 557.568359375, + 136.6607666015625 + ], + [ + 595.556640625, + 136.6607666015625 + ], + [ + 595.556640625, + 124.8626708984375 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "confidence": 0.0 + }, + { + "key": { + "text": "MOCK KEY 2", + "custom_element": { + "type": "UncategorizedText", + "element_id": "MOCK_KEY_ID_2", + "text": "MOCK KEY 2", + "metadata": { + "coordinates": { + "points": [ + [ + 428.52783203125, + 124.0478515625 + ], + [ + 428.52783203125, + 136.6943359375 + ], + [ + 473.81591796875, + 136.6943359375 + ], + [ + 473.81591796875, + 124.0478515625 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "value": null, + "confidence": 0.0 + } + ], + "file_directory": "dataset/testing_data/images", + "filename": "MOCK.png" + } + } +] \ No newline at end of file diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index b176e40c0b..7b3ff7360c 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -5,13 +5,14 @@ from __future__ import annotations import copy +import io import json import pathlib from functools import partial import pytest -from test_unstructured.unit_utils import assign_hash_ids +from test_unstructured.unit_utils import assign_hash_ids, example_doc_path from unstructured.cleaners.core import clean_bullets, clean_prefix from unstructured.documents.coordinates import ( CoordinateSystem, @@ -31,6 +32,7 @@ Title, assign_and_map_hash_ids, ) +from unstructured.partition.json import partition_json @pytest.mark.parametrize("element", [Element(), Text(text=""), 
CheckBox()]) @@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp ) assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match" assert element.id == expected_hash, "ID should be set" + + +def test_formskeysvalues_reads_saves(): + filename = example_doc_path("test_evaluate_files/unstructured_output/form.json") + as_read = partition_json(filename=filename) + tmp_file = io.StringIO() + json.dump([element.to_dict() for element in as_read], tmp_file) + tmp_file.seek(0) + as_read_2 = partition_json(file=tmp_file) + assert as_read == as_read_2 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a10aece8ba..9d410b5244 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev12" # pragma: no cover +__version__ = "0.13.8-dev13" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index b2cc258219..acf21f88fe 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -143,6 +143,18 @@ class Link(TypedDict): start_index: int +class FormKeyOrValue(TypedDict): + text: str + layout_element_id: Optional[str] + custom_element: Optional[Text] + + +class FormKeyValuePair(TypedDict): + key: FormKeyOrValue + value: Optional[FormKeyOrValue] + confidence: float + + class ElementMetadata: """Fully-dynamic replacement for dataclass-based ElementMetadata.""" @@ -176,6 +188,7 @@ class ElementMetadata: header_footer_type: Optional[str] # -- used in chunks only, when chunk must be split mid-text to fit window -- is_continuation: Optional[bool] + key_value_pairs: Optional[list[FormKeyValuePair]] languages: Optional[list[str]] last_modified: Optional[str] link_texts: Optional[list[str]] @@ -327,6 +340,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: self.data_source = DataSourceMetadata.from_dict(field_value) elif field_name == 
"orig_elements": self.orig_elements = elements_from_base64_gzipped_json(field_value) + elif field_name == "key_value_pairs": + self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value) else: setattr(self, field_name, field_value) @@ -392,6 +407,8 @@ def to_dict(self) -> dict[str, Any]: meta_dict["data_source"] = self.data_source.to_dict() if self.orig_elements is not None: meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) + if self.key_value_pairs is not None: + meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs) return meta_dict @@ -494,6 +511,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "text_as_html": cls.FIRST, # -- only occurs in Table -- "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, + "key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues -- } @@ -660,6 +678,7 @@ class ElementType: PAGE_FOOTER = "Page-footer" PAGE_NUMBER = "PageNumber" CODE_SNIPPET = "CodeSnippet" + FORM_KEYS_VALUES = "FormKeysValues" @classmethod def to_dict(cls): @@ -992,6 +1011,12 @@ class PageNumber(Text): category = "PageNumber" +class FormKeysValues(Text): + """An element for capturing Key-Value dicts (forms).""" + + category = "FormKeysValues" + + TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { ElementType.TITLE: Title, ElementType.SECTION_HEADER: Title, @@ -1029,4 +1054,43 @@ class PageNumber(Text): ElementType.PAGE_BREAK: PageBreak, ElementType.CODE_SNIPPET: CodeSnippet, ElementType.PAGE_NUMBER: PageNumber, + ElementType.FORM_KEYS_VALUES: FormKeysValues, } + + +def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: + """ + The key_value_pairs metadata field contains (in the vast majority of cases) + nested Text elements. Those need to be turned from dicts into Elements explicitly, + e.g. when partition_json is used. 
+ """ + from unstructured.staging.base import elements_from_dicts + + # safe to overwrite - deepcopy already happened + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + (kv_pair["key"]["custom_element"],) = elements_from_dicts( + [kv_pair["key"]["custom_element"]] + ) + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + (kv_pair["value"]["custom_element"],) = elements_from_dicts( + [kv_pair["value"]["custom_element"]] + ) + return kv_pairs + + +def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: + """ + The key_value_pairs metadata field contains (in the vast majority of cases) + nested Text elements. Those need to be turned from Elements to dicts recursively, + e.g. when FormKeysValues.to_dict() is used. + + """ + pair_dicts: list[Any] = copy.deepcopy(kv_pairs)  # copy: keep the caller's pairs intact + for kv_pair in pair_dicts: + if kv_pair["key"]["custom_element"] is not None: + kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() + + return pair_dicts diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 779ea470d9..8fa98db38b 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -6,9 +6,7 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata from unstructured.partition.common import exactly_one -from unstructured.partition.lang import ( - check_language_args, -) +from unstructured.partition.lang import check_language_args from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.utils.constants import PartitionStrategy @@ -33,6 +31,8 @@ def partition_image( extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, 
starting_page_number: int = 1, + extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses an image into a list of interpreted elements. @@ -90,6 +90,11 @@ def partition_image( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + extract_forms + Whether the form extraction logic should be run + (results in adding FormKeysValues elements to output). + form_extraction_skip_tables + Whether the form extraction logic should ignore regions designated as Tables. """ exactly_one(filename=filename, file=file) @@ -111,5 +116,7 @@ def partition_image( extract_image_block_to_payload=extract_image_block_to_payload, date_from_file_object=date_from_file_object, starting_page_number=starting_page_number, + extract_forms=extract_forms, + form_extraction_skip_tables=form_extraction_skip_tables, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index ba11c01898..1463fe87aa 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -12,13 +12,7 @@ import pdf2image import wrapt from pdfminer import psparser -from pdfminer.layout import ( - LTChar, - LTContainer, - LTImage, - LTItem, - LTTextBox, -) +from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename from PIL import Image as PILImage @@ -42,10 +36,7 @@ Text, process_metadata, ) -from unstructured.file_utils.filetype import ( - FileType, - add_metadata_with_filetype, -) +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common import ( @@ -57,10 +48,8 @@ ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) -from 
unstructured.partition.lang import ( - check_language_args, - prepare_languages_for_tesseract, -) +from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract +from unstructured.partition.pdf_image.form_extraction import run_form_extraction from unstructured.partition.pdf_image.pdf_image_utils import ( annotate_layout_elements, check_element_types_to_extract, @@ -85,10 +74,7 @@ OCRMode, PartitionStrategy, ) -from unstructured.partition.utils.sorting import ( - coord_has_valid_points, - sort_page_elements, -) +from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements from unstructured.patches.pdfminer import parse_keyword from unstructured.utils import requires_dependencies @@ -135,6 +121,8 @@ def partition_pdf( extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, + extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -191,6 +179,11 @@ def partition_pdf( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + extract_forms + Whether the form extraction logic should be run + (results in adding FormKeysValues elements to output). + form_extraction_skip_tables + Whether the form extraction logic should ignore regions designated as Tables. 
""" exactly_one(filename=filename, file=file) @@ -212,6 +205,8 @@ def partition_pdf( extract_image_block_to_payload=extract_image_block_to_payload, date_from_file_object=date_from_file_object, starting_page_number=starting_page_number, + extract_forms=extract_forms, + form_extraction_skip_tables=form_extraction_skip_tables, **kwargs, ) @@ -233,6 +228,8 @@ def partition_pdf_or_image( extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, + extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -304,6 +301,8 @@ def partition_pdf_or_image( extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_to_payload=extract_image_block_to_payload, starting_page_number=starting_page_number, + extract_forms=extract_forms, + form_extraction_skip_tables=form_extraction_skip_tables, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -390,6 +389,8 @@ def _partition_pdf_or_image_local( analysis: bool = False, analyzed_image_output_dir_path: Optional[str] = None, starting_page_number: int = 1, + extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -398,10 +399,7 @@ def _partition_pdf_or_image_local( process_file_with_model, ) - from unstructured.partition.pdf_image.ocr import ( - process_data_with_ocr, - process_file_with_ocr, - ) + from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr from unstructured.partition.pdf_image.pdfminer_processing import ( process_data_with_pdfminer, process_file_with_pdfminer, @@ -581,6 +579,16 @@ def _partition_pdf_or_image_local( if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"): out_elements.append(cast(Element, el)) + if extract_forms: + forms = run_form_extraction( + file=file, + 
filename=filename, + model_name=hi_res_model_name, + elements=out_elements, + skip_table_regions=form_extraction_skip_tables, + ) + out_elements.extend(forms) + return out_elements diff --git a/unstructured/partition/pdf_image/form_extraction.py b/unstructured/partition/pdf_image/form_extraction.py new file mode 100644 index 0000000000..3e63c486de --- /dev/null +++ b/unstructured/partition/pdf_image/form_extraction.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import IO, Optional + +from unstructured.documents.elements import Element, FormKeysValues + + +def run_form_extraction( + filename: str, + file: Optional[IO[bytes]], + model_name: str, + elements: list[Element], + skip_table_regions: bool, +) -> list[FormKeysValues]: + """Placeholder for form extraction; raises until an extraction model is available.""" + raise NotImplementedError("Form extraction not yet available.") diff --git a/unstructured/staging/weaviate.py b/unstructured/staging/weaviate.py index b8bafd9c7a..469d95cfe3 100644 --- a/unstructured/staging/weaviate.py +++ b/unstructured/staging/weaviate.py @@ -17,6 +17,7 @@ class Properties(TypedDict): "links", "orig_elements", "regex_metadata", + "key_value_pairs", )