Skip to content

Commit

Permalink
enhancement: partitoin_pdf() skip unnecessary element sorting (#3030)
Browse files Browse the repository at this point in the history
This PR aims to skip element sorting when determining whether embedded
text can be extracted. The extracted elements in this step are returned
as final elements only for the `fast` strategy pipeline and are never
used for other strategy pipelines (`hi_res`, `ocr`).
Removing element sorting in this step and adding it to the `fast`
strategy pipeline later will improve performance and reduce execution
time.

### Summary
- skip element sorting when determining whether embedded text can be
extracted.
- add `_partition_pdf_with_pdfparser()` function for fast` strategy
pipeline

### Testing
CI should pass.
  • Loading branch information
christinestraub authored May 16, 2024
1 parent ecdfb7a commit 1fb0fe5
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 33 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.13.8-dev11
## 0.13.8-dev12

### Enhancements

* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def mock_exists(dep):

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

mock_return = [Text("Hello there!")]
mock_return = [[Text("Hello there!")], []]
with mock.patch.object(
pdf,
"extractable_elements",
Expand All @@ -405,7 +405,7 @@ def mock_exists(dep):

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

mock_return = [Text("Hello there!")]
mock_return = [[Text("Hello there!")], []]
with mock.patch.object(
pdf,
"extractable_elements",
Expand Down
13 changes: 10 additions & 3 deletions test_unstructured/partition/test_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from unstructured.documents.elements import Text
from unstructured.partition import pdf, strategies
from unstructured.partition.utils.constants import PartitionStrategy

Expand Down Expand Up @@ -46,11 +47,17 @@ def test_is_pdf_text_extractable(filename, from_file, expected):

if from_file:
with open(filename, "rb") as f:
extractable = pdf.extractable_elements(file=f)
extracted_elements = pdf.extractable_elements(file=f)
else:
extractable = pdf.extractable_elements(filename=filename)
extracted_elements = pdf.extractable_elements(filename=filename)

assert bool(extractable) is expected
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip()
for page_elements in extracted_elements
for el in page_elements
)

assert pdf_text_extractable is expected


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev11" # pragma: no cover
__version__ = "0.13.8-dev12" # pragma: no cover
64 changes: 38 additions & 26 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,15 @@ def partition_pdf_or_image(
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
**kwargs,
)
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
isinstance(el, Text) and el.text.strip()
for page_elements in extracted_elements
for el in page_elements
)
except Exception as e:
logger.error(e)
Expand All @@ -285,7 +286,7 @@ def partition_pdf_or_image(
file.seek(0)

if strategy == PartitionStrategy.HI_RES:
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
# NOTE(robinson): Catches a UserWarning that occurs when detection is called
with warnings.catch_warnings():
warnings.simplefilter("ignore")
elements = _partition_pdf_or_image_local(
Expand All @@ -308,7 +309,12 @@ def partition_pdf_or_image(
out_elements = _process_uncategorized_text_elements(elements)

elif strategy == PartitionStrategy.FAST:
return extracted_elements
out_elements = _partition_pdf_with_pdfparser(
extracted_elements=extracted_elements,
include_page_breaks=include_page_breaks,
)

return out_elements

elif strategy == PartitionStrategy.OCR_ONLY:
# NOTE(robinson): Catches file conversion warnings when running with PDFs
Expand All @@ -331,18 +337,16 @@ def partition_pdf_or_image(
def extractable_elements(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
include_page_breaks: bool = False,
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
**kwargs: Any,
):
) -> list[list[Element]]:
if isinstance(file, bytes):
file = io.BytesIO(file)
return _partition_pdf_with_pdfminer(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand Down Expand Up @@ -600,12 +604,11 @@ def _process_uncategorized_text_elements(elements: list[Element]):
def _partition_pdf_with_pdfminer(
filename: str,
file: Optional[IO[bytes]],
include_page_breaks: bool,
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
**kwargs: Any,
) -> list[Element]:
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
processing or detectron2 is not available.
Expand All @@ -624,7 +627,6 @@ def _partition_pdf_with_pdfminer(
elements = _process_pdfminer_pages(
fp=fp,
filename=filename,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand All @@ -635,7 +637,6 @@ def _partition_pdf_with_pdfminer(
elements = _process_pdfminer_pages(
fp=file,
filename=filename,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand Down Expand Up @@ -681,17 +682,15 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
def _process_pdfminer_pages(
fp: IO[bytes],
filename: str,
include_page_breaks: bool,
languages: list[str],
metadata_last_modified: Optional[str],
sort_mode: str = SORT_MODE_XY_CUT,
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
**kwargs,
):
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements: list[Element] = []
elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
Expand Down Expand Up @@ -758,17 +757,7 @@ def _process_pdfminer_pages(
page_elements.append(element)

page_elements = _combine_list_elements(page_elements, coordinate_system)

# NOTE(crag, christine): always do the basic sort first for determinsitic order across
# python versions.
sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC)
if sort_mode != SORT_MODE_BASIC:
sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode)

elements += sorted_page_elements

if include_page_breaks:
elements.append(PageBreak(text=""))
elements.append(page_elements)

return elements

Expand Down Expand Up @@ -849,6 +838,29 @@ def _combine_coordinates_into_element1(
return copy.deepcopy(element1)


def _partition_pdf_with_pdfparser(
extracted_elements: list[list[Element]],
include_page_breaks: bool = False,
sort_mode: str = SORT_MODE_XY_CUT,
):
"""Partitions a PDF using pdfparser."""
elements = []

for page_elements in extracted_elements:
# NOTE(crag, christine): always do the basic sort first for deterministic order across
# python versions.
sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC)
if sort_mode != SORT_MODE_BASIC:
sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode)

elements += sorted_page_elements

if include_page_breaks:
elements.append(PageBreak(text=""))

return elements


def convert_pdf_to_images(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
Expand Down

0 comments on commit 1fb0fe5

Please sign in to comment.