Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add password with PDF files #3721

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,11 @@

### Fixes

* **Use password** to load password-protected PDF files with all partitioning strategies
* **V2 elements without first parent ID can be parsed**
* **Fix missing elements when layout element parsed in V2 ontology**
* updated **unstructured-inference** to be **0.8.1** in requirements/extra-pdf-image.in


## 0.16.2

### Enhancements
Expand Down
Binary file added example-docs/pdf/password.pdf
Binary file not shown.
48 changes: 48 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,51 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    ("strategy", "origin"),
    [
        (PartitionStrategy.FAST, {"pdfminer"}),
        (PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}),
        (PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
    ],
)
def test_partition_pdf_with_password(
    file_mode,
    strategy,
    origin,
    filename=example_doc_path("pdf/password.pdf"),
):
    """A password-protected PDF partitions for every strategy and input mode.

    The document is supplied to `partition_pdf` as a path (`"filename"`), as an open
    binary file (`"rb"`), or as a `SpooledTemporaryFile` (`"spool"`), always together
    with `password="password"`. Each combination must yield exactly one element whose
    text is "File with password".

    NOTE(review): `origin` is parametrized but not asserted here. Confirm whether
    `metadata.detection_origin` is populated for this document before adding a check.
    """

    def _partition(**source):
        # Every mode differs only in how the document is handed over.
        return pdf.partition_pdf(strategy=strategy, password="password", **source)

    if file_mode == "filename":
        result = _partition(filename=filename)
    elif file_mode == "rb":
        with open(filename, "rb") as f:
            result = _partition(file=f)
    else:
        # Context managers ensure both the source file and the spooled copy are closed.
        with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            result = _partition(file=spooled_temp_file)

    assert len(result) == 1
    assert result[0].text == "File with password"
4 changes: 4 additions & 0 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password:Optional[str]=None,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
50 changes: 45 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import numpy as np
import wrapt
from pdfminer import psparser
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.pdftypes import PDFObjRef
from pdfminer.utils import open_filename
from pi_heif import register_heif_opener
from PIL import Image as PILImage
Expand Down Expand Up @@ -142,6 +143,7 @@ def partition_pdf(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -222,6 +224,7 @@ def partition_pdf(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)

Expand All @@ -243,6 +246,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -271,6 +275,7 @@ def partition_pdf_or_image(
languages=languages,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
pdf_text_extractable = any(
Expand Down Expand Up @@ -320,6 +325,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -345,6 +351,7 @@ def partition_pdf_or_image(
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -358,6 +365,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password:Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
Expand All @@ -368,6 +376,7 @@ def extractable_elements(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -378,6 +387,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password:Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
Expand All @@ -401,6 +411,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -411,6 +422,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -425,14 +437,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs,
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

Expand Down Expand Up @@ -554,6 +568,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password:Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -590,10 +605,12 @@ def _partition_pdf_or_image_local(
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -633,20 +650,22 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)
else:
inferred_document_layout = process_data_with_model(
file,
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

if hasattr(file, "seek"):
file.seek(0)

extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -688,6 +707,7 @@ def _partition_pdf_or_image_local(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
password=password,
)

final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
Expand Down Expand Up @@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
Expand All @@ -858,7 +879,8 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
convert_pdf_to_images(filename, file, password=password),
start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Expand Down Expand Up @@ -1142,6 +1164,24 @@ def document_to_element_list(
page_elements.extend(element)
translation_mapping.extend([(layout_element, el) for el in element])
continue

# TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
# will need to switch to some pattern matching once we support more languages
if not word:
isalnum = char.isalnum()
if word and char.isalnum() != isalnum:
isalnum = char.isalnum()
words.append(
{"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
)
word = ""

if len(word) == 0:
start_index = text_len + index
x1 = character.x0
y2 = height - character.y0
x2 = character.x1
y1 = height - character.y1
else:

element.metadata.links = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ def __init__(
draw_grid: bool = False,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
self.draw_caption = draw_caption
self.draw_grid = draw_grid
Expand All @@ -554,6 +555,7 @@ def __init__(
self.format = format
self.drawers = []
self.file = file
self.password = password

super().__init__(filename, save_dir)

Expand Down Expand Up @@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
file=self.file,
output_folder=temp_dir,
path_only=True,
password=self.password,
)
except Exception as ex: # noqa: E722
print(
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/pdf_image/analysis/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def save_analysis_artifiacts(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Save the analysis artifacts for a given file. Loads some settings from
the environment configuration.
Expand All @@ -82,6 +83,7 @@ def save_analysis_artifiacts(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
if not filename:
filename = _generate_filename(is_image)
Expand Down Expand Up @@ -109,6 +111,7 @@ def save_analysis_artifiacts(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for layout_dumper in layout_dumpers:
Expand All @@ -125,6 +128,7 @@ def render_bboxes_for_file(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Render the bounding boxes for a given layout dump file.
To be used for analysis after the partition is performed for
Expand All @@ -144,6 +148,7 @@ def render_bboxes_for_file(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
filename_stem = Path(filename).stem
is_image = not Path(filename).suffix.endswith("pdf")
Expand Down Expand Up @@ -183,6 +188,7 @@ def render_bboxes_for_file(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for drawer in layout_drawers:
Expand Down
Loading