Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add password with PDF files #392

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added sample-docs/password.pdf
Binary file not shown.
20 changes: 20 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,25 @@ def mock_get_elements(self, *args, **kwargs):
assert page.image is None


@pytest.mark.slow()
def test_from_file_with_password(monkeypatch, mock_final_layout):
    """Check that a password-protected PDF loads by path and by file object.

    First calls ``DocumentLayout.from_file`` on the sample PDF with its
    password, then swaps in mock models and feeds the same PDF through
    ``process_data_with_model`` as an open binary stream.
    """
    # Path-based entry point; runs before any mocks are installed.
    layout_from_path = layout.DocumentLayout.from_file(
        "sample-docs/password.pdf",
        password="password",
    )
    assert layout_from_path

    def fake_get_model(x):
        # Stand-in for layout.get_model: ignores the requested model name.
        return MockLayoutModel(mock_final_layout)

    monkeypatch.setattr(layout, "get_model", fake_get_model)
    detection_model_patch = patch(
        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
        MockLayoutModel,
    )
    # Stream-based entry point: the password is passed as a keyword argument.
    with detection_model_patch, open("sample-docs/password.pdf", mode="rb") as pdf_stream:
        layout_from_stream = layout.process_data_with_model(
            pdf_stream,
            model_name="fake",
            password="password",
        )
        assert layout_from_stream



def test_from_image_file_raises_with_empty_fn():
    """Check that ``from_image_file`` raises FileNotFoundError for an empty filename."""
    empty_filename = ""
    with pytest.raises(FileNotFoundError):
        layout.DocumentLayout.from_image_file(empty_filename)
Expand Down Expand Up @@ -544,6 +563,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
detection_model=detection_model,
element_extraction_model=element_extraction_model,
fixed_layouts=None,
password=None,
pdf_image_dpi=200,
)

Expand Down
11 changes: 11 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def from_file(
filename: str,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password:Optional[str] = None,
**kwargs,
) -> DocumentLayout:
"""Creates a DocumentLayout from a pdf file."""
Expand All @@ -62,6 +63,7 @@ def from_file(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)
number_of_pages = len(image_paths)
Expand Down Expand Up @@ -133,6 +135,7 @@ def __init__(
document_filename: Optional[Union[str, PurePath]] = None,
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
password:Optional[str] = None,
):
if detection_model is not None and element_extraction_model is not None:
raise ValueError("Only one of detection_model and extraction_model should be passed.")
Expand All @@ -148,6 +151,7 @@ def __init__(
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
# locations now and if we need to support LayoutElements without bounding boxes we can make
# the bbox property optional
Expand Down Expand Up @@ -325,6 +329,7 @@ def from_image(
def process_data_with_model(
data: BinaryIO,
model_name: Optional[str],
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Process PDF as file-like object `data` into a `DocumentLayout`.
Expand All @@ -339,6 +344,7 @@ def process_data_with_model(
layout = process_file_with_model(
file_path,
model_name,
password=password,
**kwargs,
)

Expand All @@ -351,6 +357,7 @@ def process_file_with_model(
is_image: bool = False,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
Expand Down Expand Up @@ -379,6 +386,7 @@ def process_file_with_model(
element_extraction_model=element_extraction_model,
fixed_layouts=fixed_layouts,
pdf_image_dpi=pdf_image_dpi,
password=password,
**kwargs,
)
)
Expand All @@ -390,6 +398,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

Expand All @@ -402,12 +411,14 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password,
)
else:
images = pdf2image.convert_from_path(
filename,
dpi=dpi,
paths_only=path_only,
userpw=password,
)

return images