From f4aaede7a693cbf23a5a7a0b9565830349364863 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Fri, 26 Apr 2024 23:37:05 +0200 Subject: [PATCH 01/12] partial solution - implements the required interfaces --- unstructured/documents/elements.py | 50 ++++++++++++++++++++++++++++++ unstructured/partition/image.py | 6 ++++ unstructured/partition/pdf.py | 20 ++++++++++-- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 976ab1271b..ef0a765316 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -144,6 +144,45 @@ class Link(TypedDict): start_index: int +class FormKeyOrValue(TypedDict): + text: str + layout_element_id: Optional[str] + custom_element: Optional[Text] + + +class FormKeyValuePair(TypedDict): + key: FormKeyOrValue + value: Optional[FormKeyOrValue] + confidence: float + + +def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: + from unstructured.staging.base import elements_from_dicts + + # safe to overwrite - deepcopy already happened + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + (kv_pair["key"]["custom_element"],) = elements_from_dicts( + [kv_pair["key"]["custom_element"]] + ) + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + (kv_pair["value"]["custom_element"],) = elements_from_dicts( + [kv_pair["value"]["custom_element"]] + ) + return kv_pairs + + +def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: + kv_pairs: list[dict] = copy.copy(kv_pairs) # deepcopy already happened + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() + + return kv_pairs + + class ElementMetadata: """Fully-dynamic replacement for dataclass-based ElementMetadata.""" @@ -177,6 +216,7 @@ class ElementMetadata: header_footer_type: Optional[str] # -- used in chunks only, when chunk must be split mid-text to fit window -- is_continuation: Optional[bool] + key_value_pairs: Optional[list[FormKeyValuePair]] languages: Optional[list[str]] last_modified: Optional[str] link_texts: Optional[list[str]] @@ -328,6 +368,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: self.data_source = DataSourceMetadata.from_dict(field_value) elif field_name == "orig_elements": self.orig_elements = elements_from_base64_gzipped_json(field_value) + elif field_name == "key_value_pairs": + self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value) else: setattr(self, field_name, field_value) @@ -393,6 +435,8 @@ def to_dict(self) -> dict[str, Any]: meta_dict["data_source"] = self.data_source.to_dict() if self.orig_elements is not None: meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) + if self.key_value_pairs is not None: + meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs) return meta_dict @@ -665,6 +709,7 @@ class ElementType: PAGE_FOOTER = "Page-footer" PAGE_NUMBER = "PageNumber" CODE_SNIPPET = "CodeSnippet" + FORM_KEYS_VALUES = "FormKeysValues" @classmethod def to_dict(cls): @@ -997,6 +1042,10 @@ class PageNumber(Text): category = "PageNumber" +class FormKeysValues(Element): + category = "FormKeysValues" + + TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = { ElementType.TITLE: Title, ElementType.SECTION_HEADER: Title, @@ -1034,4 +1083,5 @@ class PageNumber(Text): ElementType.PAGE_BREAK: PageBreak, ElementType.CODE_SNIPPET: CodeSnippet, ElementType.PAGE_NUMBER: PageNumber, + ElementType.FORM_KEYS_VALUES: FormKeysValues, } diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 166a02b398..1e6313aa06 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -30,6 +30,8 @@ def partition_image( extract_image_block_output_dir: Optional[str] = None, extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, + extract_forms: bool = False, + **kwargs, ) -> List[Element]: """Parses an image into a list of interpreted elements. @@ -87,6 +89,9 @@ def partition_image( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + extract_forms + Whether the form extraction logic should be run + (results in adding FormKeysValues elements to output). """ exactly_one(filename=filename, file=file) @@ -107,5 +112,6 @@ def partition_image( extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_to_payload=extract_image_block_to_payload, date_from_file_object=date_from_file_object, + extract_forms=extract_forms, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 01b0c9804e..809858e166 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -155,6 +155,8 @@ def partition_pdf( extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, + extract_forms: bool = False, + **kwargs: Any, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. @@ -232,6 +234,7 @@ def partition_pdf( extract_image_block_to_payload=extract_image_block_to_payload, date_from_file_object=date_from_file_object, starting_page_number=starting_page_number, + extract_forms=extract_forms, **kwargs, ) @@ -253,6 +256,8 @@ def partition_pdf_or_image( extract_image_block_to_payload: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, + extract_forms: bool = False, + **kwargs, ) -> List[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -323,7 +328,8 @@ def partition_pdf_or_image( extract_image_block_output_dir=extract_image_block_output_dir, extract_image_block_to_payload=extract_image_block_to_payload, starting_page_number=starting_page_number, - **kwargs, + extract_forms=extract_forms, + **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -385,6 +391,10 @@ def get_the_last_modification_date_pdf_or_img( return last_modification_date +def run_form_extraction(filename: str, file: IO[bytes], model_name: str) -> List[FormMetadata]: + raise NotImplementedError("Form extraction not yet available.") + + @requires_dependencies("unstructured_inference") def _partition_pdf_or_image_local( filename: str = "", @@ -406,6 +416,7 @@ def _partition_pdf_or_image_local( analysis: bool = False, analyzed_image_output_dir_path: Optional[str] = None, starting_page_number: int = 1, + extract_forms: bool = False, **kwargs, ) -> List[Element]: """Partition using package installed locally""" @@ -523,6 +534,11 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) + if extract_forms: + forms = run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name) + else: + forms = [] + # NOTE(alan): starting with v2, chipper sorts the elements itself. if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": kwargs["sort_mode"] = SORT_MODE_DONT @@ -597,7 +613,7 @@ def _partition_pdf_or_image_local( if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"): out_elements.append(cast(Element, el)) - return out_elements + return out_elements + forms def _process_uncategorized_text_elements(elements: List[Element]): From a972dc5f820bc5fe39a8609006190fbaf0b849f6 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 9 May 2024 14:20:54 +0200 Subject: [PATCH 02/12] moved form extraction placeholder to the end of partition_local --- unstructured/partition/pdf.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 36f27a5fc0..9c0a4aca83 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -514,11 +514,6 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - if extract_forms: - forms = run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name) - else: - forms = [] - # NOTE(alan): starting with v2, chipper sorts the elements itself. if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": kwargs["sort_mode"] = SORT_MODE_DONT @@ -593,7 +588,10 @@ def _partition_pdf_or_image_local( if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"): out_elements.append(cast(Element, el)) - return out_elements + forms + if extract_forms: + out_elements.extend(run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name)) + + return out_elements def _process_uncategorized_text_elements(elements: list[Element]): From 3751b110047b943950a5c628ec16ed254f1f531a Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 9 May 2024 14:25:18 +0200 Subject: [PATCH 03/12] fixed linting --- unstructured/partition/pdf.py | 42 +++++++++++++---------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 9c0a4aca83..c268a3b2a3 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -12,13 +12,7 @@ import pdf2image import wrapt from pdfminer import psparser -from pdfminer.layout import ( - LTChar, - LTContainer, - LTImage, - LTItem, - LTTextBox, -) +from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename from PIL import Image as PILImage @@ -35,6 +29,7 @@ Element, ElementMetadata, ElementType, + FormKeysValues, Image, Link, ListItem, @@ -42,10 +37,7 @@ Text, process_metadata, ) -from unstructured.file_utils.filetype import ( - FileType, - add_metadata_with_filetype, -) +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common import ( @@ -57,10 +49,7 @@ ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.lang import ( - check_language_args, - prepare_languages_for_tesseract, -) +from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract from unstructured.partition.pdf_image.pdf_image_utils import ( annotate_layout_elements, check_element_types_to_extract, @@ -85,10 +74,7 @@ OCRMode, PartitionStrategy, ) -from unstructured.partition.utils.sorting import ( - coord_has_valid_points, - sort_page_elements, -) +from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements from unstructured.patches.pdfminer import parse_keyword from unstructured.utils import requires_dependencies @@ -137,7 +123,6 @@ def partition_pdf( date_from_file_object: bool = False, starting_page_number: int = 1, extract_forms: bool = False, - **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -194,6 +179,10 @@ def partition_pdf( date_from_file_object Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. + extract_forms + Whether the form extraction logic should be run + (results in adding FormKeysValues elements to output). + """ exactly_one(filename=filename, file=file) @@ -309,7 +298,7 @@ def partition_pdf_or_image( extract_image_block_to_payload=extract_image_block_to_payload, starting_page_number=starting_page_number, extract_forms=extract_forms, - **kwargs, + **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -371,7 +360,7 @@ def get_the_last_modification_date_pdf_or_img( return last_modification_date -def run_form_extraction(filename: str, file: IO[bytes], model_name: str) -> List[FormMetadata]: +def run_form_extraction(filename: str, file: IO[bytes], model_name: str) -> list[FormKeysValues]: raise NotImplementedError("Form extraction not yet available.") @@ -405,10 +394,7 @@ def _partition_pdf_or_image_local( process_file_with_model, ) - from unstructured.partition.pdf_image.ocr import ( - process_data_with_ocr, - process_file_with_ocr, - ) + from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr from unstructured.partition.pdf_image.pdfminer_processing import ( process_data_with_pdfminer, process_file_with_pdfminer, @@ -589,7 +575,9 @@ def _partition_pdf_or_image_local( out_elements.append(cast(Element, el)) if extract_forms: - out_elements.extend(run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name)) + out_elements.extend( + run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name) + ) return out_elements From abf0327d5a44c267073d0ca574fe328974a7dca6 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 9 May 2024 14:28:02 +0200 Subject: [PATCH 04/12] changelog update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85c9d6a298..76eea3b138 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements ### Features +* **Add form extraction basics (document elements and placeholder code in partition) ### Fixes From 67f8bc757e3b0b1c8710b599ab902be4e37677a3 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 9 May 2024 16:38:44 +0200 Subject: [PATCH 05/12] added test --- example-docs/fake_form_element/form.json | 149 +++++++++++++++++++ test_unstructured/documents/test_elements.py | 15 +- unstructured/documents/elements.py | 35 +---- unstructured/documents/form_utils.py | 32 ++++ 4 files changed, 202 insertions(+), 29 deletions(-) create mode 100644 example-docs/fake_form_element/form.json create mode 100644 unstructured/documents/form_utils.py diff --git a/example-docs/fake_form_element/form.json b/example-docs/fake_form_element/form.json new file mode 100644 index 0000000000..43bead5444 --- /dev/null +++ b/example-docs/fake_form_element/form.json @@ -0,0 +1,149 @@ +[ + { + "type": "FormKeysValues", + "element_id": "MOCK_FORM_ID", + "text": "", + "metadata": { + "coordinates": { + "points": [ + [ + 35.15625, + 95.556640625 + ], + [ + 710.357666015625, + 95.556640625 + ], + [ + 710.357666015625, + 887.890625 + ], + [ + 35.15625, + 887.890625 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1, + "key_value_pairs": [ + { + "key": { + "text": "MOCK KEY", + "custom_element": { + "type": "UncategorizedText", + "element_id": "MOCK_KEY_ID_1", + "text": "MOCK KEY", + "metadata": { + "coordinates": { + "points": [ + [ + 503.271484375, + 96.3897705078125 + ], + [ + 503.271484375, + 107.5164794921875 + ], + [ + 606.103515625, + 107.5164794921875 + ], + [ + 606.103515625, + 96.3897705078125 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "value": { + "text": "MOCK VALUE", + "custom_element": { + "type": "UncategorizedText", + "element_id": "MOCK_VALUE_ID", + "text": "MOCK VALUE", + "metadata": { + "coordinates": { + "points": [ + [ + 557.568359375, + 124.8626708984375 + ], + [ + 557.568359375, + 136.6607666015625 + ], + [ + 595.556640625, + 136.6607666015625 + ], + [ + 595.556640625, + 124.8626708984375 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "confidence": 0.0 + }, + { + "key": { + "text": "MOCK KEY 2", + "custom_element": { + "type": "UncategorizedText", + "element_id": "MOCK_KEY_ID_2", + "text": "JUNE 30]\n", + "metadata": { + "coordinates": { + "points": [ + [ + 428.52783203125, + 124.0478515625 + ], + [ + 428.52783203125, + 136.6943359375 + ], + [ + 473.81591796875, + 136.6943359375 + ], + [ + 473.81591796875, + 124.0478515625 + ] + ], + "system": "PixelSpace", + "layout_width": 754, + "layout_height": 1000 + }, + "page_number": 1 + } + }, + "layout_element_id": null + }, + "value": null, + "confidence": 0.0 + } + ], + "file_directory": "dataset/testing_data/images", + "filename": "82253245_3247.png" + } + } +] \ No newline at end of file diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index b176e40c0b..0c65b9d216 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -5,13 +5,14 @@ from __future__ import annotations import copy +import io import json import pathlib from functools import partial import pytest -from test_unstructured.unit_utils import assign_hash_ids +from test_unstructured.unit_utils import assign_hash_ids, example_doc_path from unstructured.cleaners.core import clean_bullets, clean_prefix from unstructured.documents.coordinates import ( CoordinateSystem, @@ -31,6 +32,7 @@ Title, assign_and_map_hash_ids, ) +from unstructured.partition.json import partition_json @pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()]) @@ -744,3 +746,14 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp ) assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match" assert element.id == expected_hash, "ID should be set" + + +def test_formskeysvalues_reads_saves(): + filename = example_doc_path("fake_form_element/form.json") + with open(filename) as inF: + as_read = partition_json(filename=filename) + tmp_file = io.StringIO() + json.dump([element.to_dict() for element in as_read], tmp_file) + tmp_file.seek(0) + as_read_2 = partition_json(file=tmp_file) + assert as_read == as_read_2 diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index ef0a765316..4132df99c6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -156,33 +156,6 @@ class FormKeyValuePair(TypedDict): confidence: float -def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: - from unstructured.staging.base import elements_from_dicts - - # safe to overwrite - deepcopy already happened - for kv_pair in kv_pairs: - if kv_pair["key"]["custom_element"] is not None: - (kv_pair["key"]["custom_element"],) = elements_from_dicts( - [kv_pair["key"]["custom_element"]] - ) - if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: - (kv_pair["value"]["custom_element"],) = elements_from_dicts( - [kv_pair["value"]["custom_element"]] - ) - return kv_pairs - - -def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: - kv_pairs: list[dict] = copy.copy(kv_pairs) # deepcopy already happened - for kv_pair in kv_pairs: - if kv_pair["key"]["custom_element"] is not None: - kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() - if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: - kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() - - return kv_pairs - - class ElementMetadata: """Fully-dynamic replacement for dataclass-based ElementMetadata.""" @@ -369,6 +342,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: elif field_name == "orig_elements": self.orig_elements = elements_from_base64_gzipped_json(field_value) elif field_name == "key_value_pairs": + from unstructured.documents.form_utils import _kvform_rehydrate_internal_elements + self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value) else: setattr(self, field_name, field_value) @@ -436,6 +411,8 @@ def to_dict(self) -> dict[str, Any]: if self.orig_elements is not None: meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) if self.key_value_pairs is not None: + from unstructured.documents.form_utils import _kvform_pairs_to_dict + meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs) return meta_dict @@ -1042,7 +1019,9 @@ class PageNumber(Text): category = "PageNumber" -class FormKeysValues(Element): +class FormKeysValues(Text): + """An element for capturing Key-Value dicts (forms).""" + category = "FormKeysValues" diff --git a/unstructured/documents/form_utils.py b/unstructured/documents/form_utils.py new file mode 100644 index 0000000000..1b863fa2a0 --- /dev/null +++ b/unstructured/documents/form_utils.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import copy + +from unstructured.documents.elements import FormKeyValuePair + + +def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: + from unstructured.staging.base import elements_from_dicts + + # safe to overwrite - deepcopy already happened + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + (kv_pair["key"]["custom_element"],) = elements_from_dicts( + [kv_pair["key"]["custom_element"]] + ) + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + (kv_pair["value"]["custom_element"],) = elements_from_dicts( + [kv_pair["value"]["custom_element"]] + ) + return kv_pairs + + +def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: + kv_pairs: list[dict] = copy.deepcopy(kv_pairs) + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() + + return kv_pairs From 23e897eac58005d5f4cc23dc8c76856b4d8e0fa7 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 12:39:18 +0200 Subject: [PATCH 06/12] added form_extraction_skip_tables argument --- unstructured/partition/image.py | 8 +- unstructured/partition/pdf.py | 106 ++++++++---------- .../partition/pdf_image/form_extraction.py | 15 +++ 3 files changed, 66 insertions(+), 63 deletions(-) create mode 100644 unstructured/partition/pdf_image/form_extraction.py diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index eb1ed1171b..8fa98db38b 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -6,9 +6,7 @@ from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import add_metadata from unstructured.partition.common import exactly_one -from unstructured.partition.lang import ( - check_language_args, -) +from unstructured.partition.lang import check_language_args from unstructured.partition.pdf import partition_pdf_or_image from unstructured.partition.utils.constants import PartitionStrategy @@ -34,6 +32,7 @@ def partition_image( date_from_file_object: bool = False, starting_page_number: int = 1, extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses an image into a list of interpreted elements. @@ -94,6 +93,8 @@ def partition_image( extract_forms Whether the form extraction logic should be run (results in adding FormKeysValues elements to output). + form_extraction_skip_tables + Whether the form extraction logic should ignore regions designated as Tables. """ exactly_one(filename=filename, file=file) @@ -116,5 +117,6 @@ def partition_image( date_from_file_object=date_from_file_object, starting_page_number=starting_page_number, extract_forms=extract_forms, + form_extraction_skip_tables=form_extraction_skip_tables, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 367d20d84c..9feff6d352 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -21,60 +21,43 @@ from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import ( clean_extra_whitespace_with_index_run, - index_adjustment_after_clean_extra_whitespace, -) + index_adjustment_after_clean_extra_whitespace) from unstructured.documents.coordinates import PixelSpace, PointSpace -from unstructured.documents.elements import ( - CoordinatesMetadata, - Element, - ElementMetadata, - ElementType, - FormKeysValues, - Image, - Link, - ListItem, - PageBreak, - Text, - process_metadata, -) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.documents.elements import (CoordinatesMetadata, Element, + ElementMetadata, ElementType, + Image, Link, ListItem, PageBreak, + Text, process_metadata) +from unstructured.file_utils.filetype import (FileType, + add_metadata_with_filetype) from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN -from unstructured.partition.common import ( - convert_to_bytes, - document_to_element_list, - exactly_one, - get_last_modified_date, - get_last_modified_date_from_file, - ocr_data_to_elements, - spooled_to_bytes_io_if_needed, -) -from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract +from unstructured.partition.common import (convert_to_bytes, + document_to_element_list, + exactly_one, get_last_modified_date, + get_last_modified_date_from_file, + ocr_data_to_elements, + spooled_to_bytes_io_if_needed) +from unstructured.partition.lang import (check_language_args, + prepare_languages_for_tesseract) +from unstructured.partition.pdf_image.form_extraction import \ + run_form_extraction from unstructured.partition.pdf_image.pdf_image_utils import ( - annotate_layout_elements, - check_element_types_to_extract, - save_elements, -) + annotate_layout_elements, check_element_types_to_extract, save_elements) from unstructured.partition.pdf_image.pdfminer_processing import ( - clean_pdfminer_duplicate_image_elements, - clean_pdfminer_inner_elements, - merge_inferred_with_extracted_layout, -) + clean_pdfminer_duplicate_image_elements, clean_pdfminer_inner_elements, + merge_inferred_with_extracted_layout) from unstructured.partition.pdf_image.pdfminer_utils import ( - open_pdfminer_pages_generator, - rect_to_bbox, -) -from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy + open_pdfminer_pages_generator, rect_to_bbox) +from unstructured.partition.strategies import (determine_pdf_or_image_strategy, + validate_strategy) from unstructured.partition.text import element_from_text from unstructured.partition.utils.config import env_config -from unstructured.partition.utils.constants import ( - SORT_MODE_BASIC, - SORT_MODE_DONT, - SORT_MODE_XY_CUT, - OCRMode, - PartitionStrategy, -) -from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements +from unstructured.partition.utils.constants import (SORT_MODE_BASIC, + SORT_MODE_DONT, + SORT_MODE_XY_CUT, OCRMode, + PartitionStrategy) +from unstructured.partition.utils.sorting import (coord_has_valid_points, + sort_page_elements) from unstructured.patches.pdfminer import parse_keyword from unstructured.utils import requires_dependencies @@ -122,6 +105,7 @@ def partition_pdf( date_from_file_object: bool = False, starting_page_number: int = 1, extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -181,7 +165,8 @@ def partition_pdf( extract_forms Whether the form extraction logic should be run (results in adding FormKeysValues elements to output). - + form_extraction_skip_tables + Whether the form extraction logic should ignore regions designated as Tables. """ exactly_one(filename=filename, file=file) @@ -226,6 +211,7 @@ def partition_pdf_or_image( date_from_file_object: bool = False, starting_page_number: int = 1, extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -298,6 +284,7 @@ def partition_pdf_or_image( extract_image_block_to_payload=extract_image_block_to_payload, starting_page_number=starting_page_number, extract_forms=extract_forms, + form_extraction_skip_tables=form_extraction_skip_tables, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -363,10 +350,6 @@ def get_the_last_modification_date_pdf_or_img( return last_modification_date -def run_form_extraction(filename: str, file: IO[bytes], model_name: str) -> list[FormKeysValues]: - raise NotImplementedError("Form extraction not yet available.") - - @requires_dependencies("unstructured_inference") def _partition_pdf_or_image_local( filename: str = "", @@ -389,19 +372,17 @@ def _partition_pdf_or_image_local( analyzed_image_output_dir_path: Optional[str] = None, starting_page_number: int = 1, extract_forms: bool = False, + form_extraction_skip_tables: bool = True, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" from unstructured_inference.inference.layout import ( - process_data_with_model, - process_file_with_model, - ) + process_data_with_model, process_file_with_model) - from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr + from unstructured.partition.pdf_image.ocr import (process_data_with_ocr, + process_file_with_ocr) from unstructured.partition.pdf_image.pdfminer_processing import ( - process_data_with_pdfminer, - process_file_with_pdfminer, - ) + process_data_with_pdfminer, process_file_with_pdfminer) if languages is None: languages = ["eng"] @@ -578,9 +559,14 @@ def _partition_pdf_or_image_local( out_elements.append(cast(Element, el)) if extract_forms: - out_elements.extend( - run_form_extraction(file=file, filename=filename, model_name=hi_res_model_name) + forms = run_form_extraction( + file=file, + filename=filename, + model_name=hi_res_model_name, + elements=out_elements, + skip_table_regions=form_extraction_skip_tables, ) + out_elements.extend(forms) return out_elements diff --git a/unstructured/partition/pdf_image/form_extraction.py b/unstructured/partition/pdf_image/form_extraction.py new file mode 100644 index 0000000000..3e63c486de --- /dev/null +++ b/unstructured/partition/pdf_image/form_extraction.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from typing import IO + +from unstructured.documents.elements import Element, FormKeysValues + + +def run_form_extraction( + filename: str, + file: IO[bytes], + model_name: str, + elements: list[Element], + skip_table_regions: bool, +) -> list[FormKeysValues]: + raise NotImplementedError("Form extraction not yet available.") From bdd1ddb7b64d6e0931ba3fd780fa27458a2fd9c0 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 13:18:53 +0200 Subject: [PATCH 07/12] fixed import sorting with ruff instead of isort --- unstructured/partition/pdf.py | 86 +++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 9feff6d352..1463fe87aa 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -21,43 +21,60 @@ from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import ( clean_extra_whitespace_with_index_run, - index_adjustment_after_clean_extra_whitespace) + index_adjustment_after_clean_extra_whitespace, +) from unstructured.documents.coordinates import PixelSpace, PointSpace -from unstructured.documents.elements import (CoordinatesMetadata, Element, - ElementMetadata, ElementType, - Image, Link, ListItem, PageBreak, - Text, process_metadata) -from unstructured.file_utils.filetype import (FileType, - add_metadata_with_filetype) +from unstructured.documents.elements import ( + CoordinatesMetadata, + Element, + ElementMetadata, + ElementType, + Image, + Link, + ListItem, + PageBreak, + Text, + process_metadata, +) +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN -from unstructured.partition.common import (convert_to_bytes, - document_to_element_list, - exactly_one, get_last_modified_date, - get_last_modified_date_from_file, - ocr_data_to_elements, - spooled_to_bytes_io_if_needed) -from unstructured.partition.lang import (check_language_args, - prepare_languages_for_tesseract) -from unstructured.partition.pdf_image.form_extraction import \ - run_form_extraction +from unstructured.partition.common import ( + convert_to_bytes, + document_to_element_list, + exactly_one, + get_last_modified_date, + get_last_modified_date_from_file, + ocr_data_to_elements, + spooled_to_bytes_io_if_needed, +) +from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract +from unstructured.partition.pdf_image.form_extraction import run_form_extraction from unstructured.partition.pdf_image.pdf_image_utils import ( - annotate_layout_elements, check_element_types_to_extract, save_elements) + annotate_layout_elements, + check_element_types_to_extract, + save_elements, +) from unstructured.partition.pdf_image.pdfminer_processing import ( - clean_pdfminer_duplicate_image_elements, clean_pdfminer_inner_elements, - merge_inferred_with_extracted_layout) + clean_pdfminer_duplicate_image_elements, + clean_pdfminer_inner_elements, + merge_inferred_with_extracted_layout, +) from unstructured.partition.pdf_image.pdfminer_utils import ( - open_pdfminer_pages_generator, rect_to_bbox) -from unstructured.partition.strategies import (determine_pdf_or_image_strategy, - validate_strategy) + open_pdfminer_pages_generator, + rect_to_bbox, +) +from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy from unstructured.partition.text import element_from_text from unstructured.partition.utils.config import env_config -from unstructured.partition.utils.constants import (SORT_MODE_BASIC, - SORT_MODE_DONT, - SORT_MODE_XY_CUT, OCRMode, - PartitionStrategy) -from unstructured.partition.utils.sorting import (coord_has_valid_points, - sort_page_elements) +from unstructured.partition.utils.constants import ( + SORT_MODE_BASIC, + SORT_MODE_DONT, + SORT_MODE_XY_CUT, + OCRMode, + PartitionStrategy, +) +from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements from unstructured.patches.pdfminer import parse_keyword from unstructured.utils import requires_dependencies @@ -377,12 +394,15 @@ def _partition_pdf_or_image_local( ) -> list[Element]: """Partition using package installed locally""" from unstructured_inference.inference.layout import ( - process_data_with_model, process_file_with_model) + process_data_with_model, + process_file_with_model, + ) - from unstructured.partition.pdf_image.ocr import (process_data_with_ocr, - process_file_with_ocr) + from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr from unstructured.partition.pdf_image.pdfminer_processing import ( - process_data_with_pdfminer, process_file_with_pdfminer) + process_data_with_pdfminer, + process_file_with_pdfminer, + ) if languages is None: languages = ["eng"] From 3de5829ea974560d57345a08c558d514da68c9c7 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 13:27:00 +0200 Subject: [PATCH 08/12] removed useless closure from test elements --- test_unstructured/documents/test_elements.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 0c65b9d216..83dced0ba6 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -750,8 +750,7 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp def test_formskeysvalues_reads_saves(): filename = example_doc_path("fake_form_element/form.json") - with open(filename) as inF: - as_read = partition_json(filename=filename) + as_read = partition_json(filename=filename) tmp_file = io.StringIO() json.dump([element.to_dict() for element in as_read], tmp_file) tmp_file.seek(0) From 1e09082c7f0d0461e03ce050aeda346d7ec0993b Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 13:28:28 +0200 Subject: [PATCH 09/12] removed some (non-dangerous) leaked info from fake form --- example-docs/fake_form_element/form.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example-docs/fake_form_element/form.json b/example-docs/fake_form_element/form.json index 43bead5444..468fc4468e 100644 --- a/example-docs/fake_form_element/form.json +++ b/example-docs/fake_form_element/form.json @@ -108,7 +108,7 @@ "custom_element": { "type": "UncategorizedText", "element_id": "MOCK_KEY_ID_2", - "text": "JUNE 30]\n", + "text": "MOCK KEY 2", "metadata": { "coordinates": { "points": [ @@ -143,7 +143,7 @@ } ], "file_directory": "dataset/testing_data/images", - "filename": "82253245_3247.png" + "filename": "MOCK.png" } } ] \ No newline at end of file From bae66aa06180549b3096de83da0bf49beaca9863 Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 14:19:02 +0200 Subject: [PATCH 10/12] fixed weaviate conversion and ConsolidationStrategy for FormKeysValues --- unstructured/documents/elements.py | 1 + unstructured/staging/weaviate.py | 1 + 2 files changed, 2 insertions(+) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 49bf99f34b..7606ffa5eb 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -515,6 +515,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "text_as_html": cls.FIRST, # -- only occurs in Table -- "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, + "key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues -- } diff --git a/unstructured/staging/weaviate.py b/unstructured/staging/weaviate.py index b8bafd9c7a..469d95cfe3 100644 --- a/unstructured/staging/weaviate.py +++ b/unstructured/staging/weaviate.py @@ -17,6 +17,7 @@ class Properties(TypedDict): "links", "orig_elements", "regex_metadata", + "key_value_pairs", ) From b0d6bea2c042d6b55db1c1ac9388ea9eb75d989c Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 14:47:50 +0200 Subject: [PATCH 11/12] added docstrings to de-/re-hydration strategies for FormKeysValues metadata, also small qol changes --- .../unstructured_output}/form.json | 0 test_unstructured/documents/test_elements.py | 2 +- unstructured/documents/elements.py | 42 +++++++++++++++++-- unstructured/documents/form_utils.py | 32 -------------- 4 files changed, 39 insertions(+), 37 deletions(-) rename example-docs/{fake_form_element => test_evaluate_files/unstructured_output}/form.json (100%) delete mode 100644 unstructured/documents/form_utils.py diff --git a/example-docs/fake_form_element/form.json b/example-docs/test_evaluate_files/unstructured_output/form.json similarity index 100% rename from example-docs/fake_form_element/form.json rename to example-docs/test_evaluate_files/unstructured_output/form.json diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 83dced0ba6..7b3ff7360c 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -749,7 +749,7 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp def test_formskeysvalues_reads_saves(): - filename = example_doc_path("fake_form_element/form.json") + filename = example_doc_path("test_evaluate_files/unstructured_output/form.json") as_read = partition_json(filename=filename) tmp_file = io.StringIO() json.dump([element.to_dict() for element in as_read], tmp_file) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 7606ffa5eb..acf21f88fe 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -341,8 +341,6 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: elif field_name == "orig_elements": self.orig_elements = elements_from_base64_gzipped_json(field_value) elif field_name == "key_value_pairs": - from unstructured.documents.form_utils import _kvform_rehydrate_internal_elements - self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value) else: setattr(self, field_name, field_value) @@ -410,8 +408,6 @@ def to_dict(self) -> dict[str, Any]: if self.orig_elements is not None: meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) if self.key_value_pairs is not None: - from unstructured.documents.form_utils import _kvform_pairs_to_dict - meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs) return meta_dict @@ -1060,3 +1056,41 @@ class FormKeysValues(Text): ElementType.PAGE_NUMBER: PageNumber, ElementType.FORM_KEYS_VALUES: FormKeysValues, } + + +def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: + """ + The key_value_pairs metadata field contains (in the vast majority of cases) + nested Text elements. Those need to be turned from dicts into Elements explicitly, + e.g. when partition_json is used. + """ + from unstructured.staging.base import elements_from_dicts + + # safe to overwrite - deepcopy already happened + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + (kv_pair["key"]["custom_element"],) = elements_from_dicts( + [kv_pair["key"]["custom_element"]] + ) + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + (kv_pair["value"]["custom_element"],) = elements_from_dicts( + [kv_pair["value"]["custom_element"]] + ) + return kv_pairs + + +def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: + """ + The key_value_pairs metadata field contains (in the vast majority of cases) + nested Text elements. Those need to be turned from Elements to dicts recursively, + e.g. when FormKeysValues.to_dict() is used. + + """ + kv_pairs: list[dict] = copy.deepcopy(kv_pairs) + for kv_pair in kv_pairs: + if kv_pair["key"]["custom_element"] is not None: + kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() + if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: + kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() + + return kv_pairs diff --git a/unstructured/documents/form_utils.py b/unstructured/documents/form_utils.py deleted file mode 100644 index 1b863fa2a0..0000000000 --- a/unstructured/documents/form_utils.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import annotations - -import copy - -from unstructured.documents.elements import FormKeyValuePair - - -def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]: - from unstructured.staging.base import elements_from_dicts - - # safe to overwrite - deepcopy already happened - for kv_pair in kv_pairs: - if kv_pair["key"]["custom_element"] is not None: - (kv_pair["key"]["custom_element"],) = elements_from_dicts( - [kv_pair["key"]["custom_element"]] - ) - if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: - (kv_pair["value"]["custom_element"],) = elements_from_dicts( - [kv_pair["value"]["custom_element"]] - ) - return kv_pairs - - -def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]: - kv_pairs: list[dict] = copy.deepcopy(kv_pairs) - for kv_pair in kv_pairs: - if kv_pair["key"]["custom_element"] is not None: - kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict() - if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None: - kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict() - - return kv_pairs From e96f66440583d35ed6ee3289bf2cc8984fa2080f Mon Sep 17 00:00:00 2001 From: Jan Kanty Milczek Date: Thu, 16 May 2024 15:50:02 +0200 Subject: [PATCH 12/12] version bump, bigger changelog entry --- CHANGELOG.md | 4 ++-- unstructured/__version__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c5b6c9088..cd879e566a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev12 +## 0.13.8-dev13 ### Enhancements @@ -7,7 +7,7 @@ * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. ### Features -* **Add form extraction basics (document elements and placeholder code in partition) +* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`. ### Fixes diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a10aece8ba..9d410b5244 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev12" # pragma: no cover +__version__ = "0.13.8-dev13" # pragma: no cover