Skip to content

Commit

Permalink
fix: fix IndexError when partitioning a pdf with `starting_page_numbe…
Browse files Browse the repository at this point in the history
…r` (#3246)

The Issue:

When extracting images from pdfs, we use the metadata page number to
index into a list of per-page image paths. However, the metadata page
number can now be offset via `starting_page_number`. To get the true
(zero-based) page index, we need to subtract `starting_page_number`
from the metadata page number.

Testing:

Run this snippet in a python shell. Before the fix, this throws an
IndexError. On this branch, it will return the elements.
```
from unstructured.partition.auto import partition
filename = "example-docs/layout-parser-paper-with-table.pdf"
partition(filename, strategy="hi_res", extract_image_block_types=["Image", "Table"], starting_page_number=20)
```

---------

Co-authored-by: Matt Robinson <[email protected]>
Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
3 people authored Jun 19, 2024
1 parent c3af03d commit 0b73978
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.7-dev4
## 0.14.7-dev5

### Enhancements

Expand All @@ -12,6 +12,7 @@
### Fixes

* **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image.
* **Fix an IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`.**

## 0.14.6

Expand Down
6 changes: 5 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,8 @@ def test_partition_pdf_element_extraction(
if file_mode == "filename":
elements = pdf.partition_pdf(
filename=filename,
# Image extraction shouldn't break by setting this
starting_page_number=20,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
Expand All @@ -1231,11 +1233,13 @@ def test_partition_pdf_element_extraction(
with open(filename, "rb") as f:
elements = pdf.partition_pdf(
file=f,
# Image extraction shouldn't break by setting this
starting_page_number=20,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)

assert elements[0].metadata.page_number == 20
assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
)
Expand Down
2 changes: 2 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def test_save_elements(

pdf_image_utils.save_elements(
elements=elements,
starting_page_number=1,
element_category_to_save=element_category_to_save,
pdf_image_dpi=200,
filename=filename,
Expand Down Expand Up @@ -157,6 +158,7 @@ def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
pdf_image_utils.save_elements(
elements=[],
element_category_to_save="",
starting_page_number=1,
pdf_image_dpi=200,
filename="dummy.pdf",
output_dir_path=None,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.7-dev4" # pragma: no cover
__version__ = "0.14.7-dev5" # pragma: no cover
2 changes: 2 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ def _partition_pdf_or_image_local(
if extract_images_in_pdf:
save_elements(
elements=elements,
starting_page_number=starting_page_number,
element_category_to_save=ElementType.IMAGE,
filename=filename,
file=file,
Expand All @@ -675,6 +676,7 @@ def _partition_pdf_or_image_local(

save_elements(
elements=elements,
starting_page_number=starting_page_number,
element_category_to_save=el_type,
filename=filename,
file=file,
Expand Down
12 changes: 9 additions & 3 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def pad_bbox(

def save_elements(
elements: List["Element"],
starting_page_number: int,
element_category_to_save: str,
pdf_image_dpi: int,
filename: str = "",
Expand Down Expand Up @@ -183,16 +184,21 @@ def save_elements(
padded_bbox = cast(
Tuple[int, int, int, int], pad_bbox((x1, y1, x2, y2), (h_padding, v_padding))
)
page_number = el.metadata.page_number

# The page number in the metadata may have been offset
# by starting_page_number. Make sure we use the right
# value for indexing!
metadata_page_number = el.metadata.page_number
page_index = metadata_page_number - starting_page_number

figure_number += 1
try:
basename = "table" if el.category == ElementType.TABLE else "figure"
output_f_path = os.path.join(
output_dir_path,
f"{basename}-{page_number}-{figure_number}.jpg",
f"{basename}-{metadata_page_number}-{figure_number}.jpg",
)
image_path = image_paths[page_number - 1]
image_path = image_paths[page_index]
image = Image.open(image_path)
cropped_image = image.crop(padded_bbox)
if extract_image_block_to_payload:
Expand Down

0 comments on commit 0b73978

Please sign in to comment.