Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhancement: partition_pdf() skip unnecessary element sorting #3030

Merged
merged 8 commits into from
May 16, 2024
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.13.8-dev11
## 0.13.8-dev12

### Enhancements

* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation.** Support for concurrent processing of documents during evaluation.
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def mock_exists(dep):

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

mock_return = [Text("Hello there!")]
mock_return = [[Text("Hello there!")], []]
with mock.patch.object(
pdf,
"extractable_elements",
Expand All @@ -405,7 +405,7 @@ def mock_exists(dep):

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

mock_return = [Text("Hello there!")]
mock_return = [[Text("Hello there!")], []]
with mock.patch.object(
pdf,
"extractable_elements",
Expand Down
13 changes: 10 additions & 3 deletions test_unstructured/partition/test_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from unstructured.documents.elements import Text
from unstructured.partition import pdf, strategies
from unstructured.partition.utils.constants import PartitionStrategy

Expand Down Expand Up @@ -46,11 +47,17 @@ def test_is_pdf_text_extractable(filename, from_file, expected):

if from_file:
with open(filename, "rb") as f:
extractable = pdf.extractable_elements(file=f)
extracted_elements = pdf.extractable_elements(file=f)
else:
extractable = pdf.extractable_elements(filename=filename)
extracted_elements = pdf.extractable_elements(filename=filename)

assert bool(extractable) is expected
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip()
for page_elements in extracted_elements
for el in page_elements
)

assert pdf_text_extractable is expected


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev11" # pragma: no cover
__version__ = "0.13.8-dev12" # pragma: no cover
64 changes: 38 additions & 26 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,15 @@ def partition_pdf_or_image(
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
**kwargs,
)
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
isinstance(el, Text) and el.text.strip()
for page_elements in extracted_elements
for el in page_elements
)
except Exception as e:
logger.error(e)
Expand All @@ -285,7 +286,7 @@ def partition_pdf_or_image(
file.seek(0)

if strategy == PartitionStrategy.HI_RES:
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
# NOTE(robinson): Catches a UserWarning that occurs when detection is called
with warnings.catch_warnings():
warnings.simplefilter("ignore")
elements = _partition_pdf_or_image_local(
Expand All @@ -308,7 +309,12 @@ def partition_pdf_or_image(
out_elements = _process_uncategorized_text_elements(elements)

elif strategy == PartitionStrategy.FAST:
return extracted_elements
out_elements = _partition_pdf_with_pdfparser(
extracted_elements=extracted_elements,
include_page_breaks=include_page_breaks,
)

return out_elements

elif strategy == PartitionStrategy.OCR_ONLY:
# NOTE(robinson): Catches file conversion warnings when running with PDFs
Expand All @@ -331,18 +337,16 @@ def partition_pdf_or_image(
def extractable_elements(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
include_page_breaks: bool = False,
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
**kwargs: Any,
):
) -> list[list[Element]]:
if isinstance(file, bytes):
file = io.BytesIO(file)
return _partition_pdf_with_pdfminer(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand Down Expand Up @@ -600,12 +604,11 @@ def _process_uncategorized_text_elements(elements: list[Element]):
def _partition_pdf_with_pdfminer(
filename: str,
file: Optional[IO[bytes]],
include_page_breaks: bool,
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
**kwargs: Any,
) -> list[Element]:
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
processing or detectron2 is not available.

Expand All @@ -624,7 +627,6 @@ def _partition_pdf_with_pdfminer(
elements = _process_pdfminer_pages(
fp=fp,
filename=filename,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand All @@ -635,7 +637,6 @@ def _partition_pdf_with_pdfminer(
elements = _process_pdfminer_pages(
fp=file,
filename=filename,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
Expand Down Expand Up @@ -681,17 +682,15 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
def _process_pdfminer_pages(
fp: IO[bytes],
filename: str,
include_page_breaks: bool,
languages: list[str],
metadata_last_modified: Optional[str],
sort_mode: str = SORT_MODE_XY_CUT,
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
**kwargs,
):
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements: list[Element] = []
elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
Expand Down Expand Up @@ -758,17 +757,7 @@ def _process_pdfminer_pages(
page_elements.append(element)

page_elements = _combine_list_elements(page_elements, coordinate_system)

# NOTE(crag, christine): always do the basic sort first for deterministic order across
# python versions.
sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC)
if sort_mode != SORT_MODE_BASIC:
sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode)

elements += sorted_page_elements

if include_page_breaks:
elements.append(PageBreak(text=""))
elements.append(page_elements)

return elements

Expand Down Expand Up @@ -849,6 +838,29 @@ def _combine_coordinates_into_element1(
return copy.deepcopy(element1)


def _partition_pdf_with_pdfparser(
    extracted_elements: list[list[Element]],
    include_page_breaks: bool = False,
    sort_mode: str = SORT_MODE_XY_CUT,
):
    """Flatten per-page extracted elements into a single sorted list.

    Each page's elements are sorted on their own and appended in page order.

    Args:
        extracted_elements: One inner list of elements per page.
        include_page_breaks: When true, a `PageBreak` with empty text is appended
            after the elements of every page.
        sort_mode: Sort applied after the basic sort; skipped when it already
            equals `SORT_MODE_BASIC`.

    Returns:
        The flat list of sorted elements (plus page breaks, if requested).
    """
    needs_second_pass = sort_mode != SORT_MODE_BASIC
    flattened = []

    for single_page in extracted_elements:
        # NOTE(crag, christine): the basic sort always runs first so the resulting
        # order is deterministic across python versions.
        ordered = sort_page_elements(single_page, SORT_MODE_BASIC)
        if needs_second_pass:
            ordered = sort_page_elements(ordered, sort_mode)

        flattened.extend(ordered)

        if include_page_breaks:
            flattened.append(PageBreak(text=""))

    return flattened


def convert_pdf_to_images(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
Expand Down
Loading