From 07f44638e83313d8e999b134711fa1da5fbeffae Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 14:13:51 -0700 Subject: [PATCH 1/6] feat: skip element sorting when determining whether embedded text is extractable --- unstructured/partition/pdf.py | 49 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0371866a77..3f1b227634 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -259,14 +259,15 @@ def partition_pdf_or_image( extracted_elements = extractable_elements( filename=filename, file=spooled_to_bytes_io_if_needed(file), - include_page_breaks=include_page_breaks, languages=languages, metadata_last_modified=metadata_last_modified or last_modification_date, starting_page_number=starting_page_number, **kwargs, ) pdf_text_extractable = any( - isinstance(el, Text) and el.text.strip() for el in extracted_elements + isinstance(el, Text) and el.text.strip() + for page_elements in extracted_elements + for el in page_elements ) except Exception as e: logger.error(e) @@ -308,7 +309,22 @@ def partition_pdf_or_image( out_elements = _process_uncategorized_text_elements(elements) elif strategy == PartitionStrategy.FAST: - return extracted_elements + out_elements = [] + sort_mode: str = SORT_MODE_XY_CUT + + for page_elements in extracted_elements: + # NOTE(crag, christine): always do the basic sort first for deterministic order across + # python versions. 
+ sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) + if sort_mode != SORT_MODE_BASIC: + sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) + + out_elements += sorted_page_elements + + if include_page_breaks: + out_elements.append(PageBreak(text="")) + + return out_elements elif strategy == PartitionStrategy.OCR_ONLY: # NOTE(robinson): Catches file conversion warnings when running with PDFs @@ -331,18 +347,16 @@ def partition_pdf_or_image( def extractable_elements( filename: str = "", file: Optional[bytes | IO[bytes]] = None, - include_page_breaks: bool = False, languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, **kwargs: Any, -): +) -> list[list[Element]]: if isinstance(file, bytes): file = io.BytesIO(file) return _partition_pdf_with_pdfminer( filename=filename, file=file, - include_page_breaks=include_page_breaks, languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, @@ -600,12 +614,11 @@ def _process_uncategorized_text_elements(elements: list[Element]): def _partition_pdf_with_pdfminer( filename: str, file: Optional[IO[bytes]], - include_page_breaks: bool, languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, **kwargs: Any, -) -> list[Element]: +) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster processing or detectron2 is not available. 
@@ -624,7 +637,6 @@ def _partition_pdf_with_pdfminer( elements = _process_pdfminer_pages( fp=fp, filename=filename, - include_page_breaks=include_page_breaks, languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, @@ -635,7 +647,6 @@ def _partition_pdf_with_pdfminer( elements = _process_pdfminer_pages( fp=file, filename=filename, - include_page_breaks=include_page_breaks, languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, @@ -681,17 +692,15 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): def _process_pdfminer_pages( fp: IO[bytes], filename: str, - include_page_breaks: bool, languages: list[str], metadata_last_modified: Optional[str], - sort_mode: str = SORT_MODE_XY_CUT, annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, **kwargs, -): +) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" - elements: list[Element] = [] + elements = [] for page_number, (page, page_layout) in enumerate( open_pdfminer_pages_generator(fp), start=starting_page_number @@ -758,17 +767,7 @@ def _process_pdfminer_pages( page_elements.append(element) page_elements = _combine_list_elements(page_elements, coordinate_system) - - # NOTE(crag, christine): always do the basic sort first for determinsitic order across - # python versions. 
- sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) - if sort_mode != SORT_MODE_BASIC: - sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) - - elements += sorted_page_elements - - if include_page_breaks: - elements.append(PageBreak(text="")) + elements.append(page_elements) return elements From 613bc3e25011197b3f61551dd255e985209a2576 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 14:29:35 -0700 Subject: [PATCH 2/6] feat: add `_partition_pdf_with_pdfparser()` for `fast` strategy pipeline --- unstructured/partition/pdf.py | 43 +++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 3f1b227634..ba11c01898 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -286,7 +286,7 @@ def partition_pdf_or_image( file.seek(0) if strategy == PartitionStrategy.HI_RES: - # NOTE(robinson): Catches a UserWarning that occurs when detectron is called + # NOTE(robinson): Catches a UserWarning that occurs when detection is called with warnings.catch_warnings(): warnings.simplefilter("ignore") elements = _partition_pdf_or_image_local( @@ -309,20 +309,10 @@ def partition_pdf_or_image( out_elements = _process_uncategorized_text_elements(elements) elif strategy == PartitionStrategy.FAST: - out_elements = [] - sort_mode: str = SORT_MODE_XY_CUT - - for page_elements in extracted_elements: - # NOTE(crag, christine): always do the basic sort first for deterministic order across - # python versions. 
- sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) - if sort_mode != SORT_MODE_BASIC: - sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) - - out_elements += sorted_page_elements - - if include_page_breaks: - out_elements.append(PageBreak(text="")) + out_elements = _partition_pdf_with_pdfparser( + extracted_elements=extracted_elements, + include_page_breaks=include_page_breaks, + ) return out_elements @@ -848,6 +838,29 @@ def _combine_coordinates_into_element1( return copy.deepcopy(element1) +def _partition_pdf_with_pdfparser( + extracted_elements: list[list[Element]], + include_page_breaks: bool = False, + sort_mode: str = SORT_MODE_XY_CUT, +): + """Partitions a PDF using pdfparser.""" + elements = [] + + for page_elements in extracted_elements: + # NOTE(crag, christine): always do the basic sort first for deterministic order across + # python versions. + sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) + if sort_mode != SORT_MODE_BASIC: + sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) + + elements += sorted_page_elements + + if include_page_breaks: + elements.append(PageBreak(text="")) + + return elements + + def convert_pdf_to_images( filename: str = "", file: Optional[bytes | IO[bytes]] = None, From 7663d8ba9f54aa4cbdf73bc80b87b767da932aa3 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 15:16:58 -0700 Subject: [PATCH 3/6] chore: update changelog & version --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28b09f0c14..863dc4685b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.13.8-dev11 +## 0.13.8-dev12 ### Enhancements +* **Skip unnecessary element sorting**. Skip element sorting when determining whether embedded text is extractable. 
* **Faster evaluation** Support for concurrent processing of documents during evaluation * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d0f6aafe83..a10aece8ba 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev11" # pragma: no cover +__version__ = "0.13.8-dev12" # pragma: no cover From fe5d7563b16fcac397f9dabc7933ce5ba168468f Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 15:21:40 -0700 Subject: [PATCH 4/6] chore: update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 863dc4685b..5c6fbd2852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Enhancements -* **Skip unnecessary element sorting**. Skip element sorting when determining whether embedded text is extractable. +* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text is extractable. * **Faster evaluation** Support for concurrent processing of documents during evaluation * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. From 061b306be7cffe07f999531093b2ead4467d001f Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 15:32:39 -0700 Subject: [PATCH 5/6] chore: update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c6fbd2852..bef231d402 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Enhancements -* **Skip unnecessary element sorting in `partition_pdf()`**. 
Skip element sorting when determining whether embedded text is extractable. +* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted. * **Faster evaluation** Support for concurrent processing of documents during evaluation * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. From e860c7673723aa42e79fe60c23dba40ac0be1705 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 May 2024 16:50:58 -0700 Subject: [PATCH 6/6] test: update unit tests --- test_unstructured/partition/pdf_image/test_pdf.py | 4 ++-- test_unstructured/partition/test_strategies.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index edb64ef133..48a7ffd91d 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -383,7 +383,7 @@ def mock_exists(dep): monkeypatch.setattr(strategies, "dependency_exists", mock_exists) - mock_return = [Text("Hello there!")] + mock_return = [[Text("Hello there!")], []] with mock.patch.object( pdf, "extractable_elements", @@ -405,7 +405,7 @@ def mock_exists(dep): monkeypatch.setattr(strategies, "dependency_exists", mock_exists) - mock_return = [Text("Hello there!")] + mock_return = [[Text("Hello there!")], []] with mock.patch.object( pdf, "extractable_elements", diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py index fbc580bda8..9c66076eab 100644 --- a/test_unstructured/partition/test_strategies.py +++ b/test_unstructured/partition/test_strategies.py @@ -2,6 +2,7 @@ import pytest +from unstructured.documents.elements import Text from unstructured.partition import pdf, strategies 
from unstructured.partition.utils.constants import PartitionStrategy @@ -46,11 +47,17 @@ def test_is_pdf_text_extractable(filename, from_file, expected): if from_file: with open(filename, "rb") as f: - extractable = pdf.extractable_elements(file=f) + extracted_elements = pdf.extractable_elements(file=f) else: - extractable = pdf.extractable_elements(filename=filename) + extracted_elements = pdf.extractable_elements(filename=filename) - assert bool(extractable) is expected + pdf_text_extractable = any( + isinstance(el, Text) and el.text.strip() + for page_elements in extracted_elements + for el in page_elements + ) + + assert pdf_text_extractable is expected @pytest.mark.parametrize(