Skip to content

Commit

Permalink
ENH: Support alternative (U)F names for embedded file retrieval (#3072)
Browse files Browse the repository at this point in the history
Closes #3070.
  • Loading branch information
stefan6419846 authored Jan 26, 2025
1 parent ad97deb commit b94d203
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/github-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
sudo apt-get update
- name: Install APT dependencies
run:
sudo apt-get install ghostscript
sudo apt-get install ghostscript poppler-utils
- name: Checkout Code
uses: actions/checkout@v4
with:
Expand Down
66 changes: 49 additions & 17 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1350,7 +1350,8 @@ def _list_attachments(self) -> List[str]:
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
names = cast(
ArrayObject,
cast(
DictionaryObject,
Expand All @@ -1359,8 +1360,23 @@ def _list_attachments(self) -> List[str]:
)
except KeyError:
return []
attachments_names = [f for f in filenames if isinstance(f, str)]
return attachments_names
attachment_names: List[str] = []
for i, name in enumerate(names):
if isinstance(name, str):
attachment_names.append(name)
else:
name = name.get_object()
for key in ["/UF", "/F"]:
# PDF 2.0 reference, table 43:
# > A PDF reader shall use the value of the UF key, when present, instead of the F key.
if key in name:
name = name[key].get_object()
if name == names[i - 1]:
# Avoid duplicates for the same entry.
continue
attachment_names.append(name)
break
return attachment_names

def _get_attachment_list(self, name: str) -> List[bytes]:
out = self._get_attachments(name)[name]
Expand Down Expand Up @@ -1389,7 +1405,8 @@ def _get_attachments(
catalog = self.root_object
# From the catalog get the embedded file names
try:
filenames = cast(
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
names = cast(
ArrayObject,
cast(
DictionaryObject,
Expand All @@ -1399,21 +1416,36 @@ def _get_attachments(
except KeyError:
return {}
attachments: Dict[str, Union[bytes, List[bytes]]] = {}

# Loop through attachments
for i in range(len(filenames)):
f = filenames[i]
if isinstance(f, str):
if filename is not None and f != filename:
continue
name = f
f_dict = filenames[i + 1].get_object()
f_data = f_dict["/EF"]["/F"].get_data()
if name in attachments:
if not isinstance(attachments[name], list):
attachments[name] = [attachments[name]] # type:ignore
attachments[name].append(f_data) # type:ignore
for i, name in enumerate(names):
if isinstance(name, str):
# Retrieve the corresponding reference.
file_dictionary = names[i + 1].get_object()
else:
# We have the reference, but need to determine the name.
file_dictionary = name.get_object()
for key in ["/UF", "/F"]:
# PDF 2.0 reference, table 43:
# > A PDF reader shall use the value of the UF key, when present, instead of the F key.
if key in file_dictionary:
name = file_dictionary[key].get_object()
break
else:
attachments[name] = f_data
continue
if name == names[i - 1]:
# Avoid extracting the same file twice.
continue

if filename is not None and name != filename:
continue
file_data = file_dictionary["/EF"]["/F"].get_data()
if name in attachments:
if not isinstance(attachments[name], list):
attachments[name] = [attachments[name]] # type:ignore
attachments[name].append(file_data) # type:ignore
else:
attachments[name] = file_data
return attachments

@abstractmethod
Expand Down
71 changes: 71 additions & 0 deletions tests/test_doc_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Test the pypdf._doc_common module."""
import re
import shutil
import subprocess
from pathlib import Path

import pytest

from pypdf import PdfReader, PdfWriter

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
SAMPLE_ROOT = PROJECT_ROOT / "sample-files"

PDFATTACH_BINARY = shutil.which("pdfattach")


@pytest.mark.skipif(PDFATTACH_BINARY is None, reason="Requires poppler-utils")
def test_attachments(tmpdir):
    """Check attachment listing and retrieval for several /UF and /F combinations.

    A base PDF with one attachment is created with the ``pdfattach`` binary.
    The further variants are derived from it by rewriting the ``/UF`` entry
    directly in the raw bytes of that file.
    """
    # No attachments.
    clean_path = SAMPLE_ROOT / "002-trivial-libre-office-writer" / "002-trivial-libre-office-writer.pdf"
    with PdfReader(clean_path) as pdf:
        assert pdf._list_attachments() == []

    # UF = name.
    attached_path = tmpdir / "attached.pdf"
    file_path = tmpdir / "test.txt"
    file_path.write_binary(b"Hello World\n")
    # `check=True` makes a failing pdfattach call abort the test right here,
    # instead of surfacing later as an unrelated error when reading a missing
    # or incomplete output file.
    subprocess.run(  # noqa: S603
        [PDFATTACH_BINARY, clean_path, file_path, attached_path],
        check=True,
    )
    with PdfReader(str(attached_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # UF != name: the /UF value is replaced by a different file name, so both
    # names are expected to be listed and to resolve to the same content.
    different_path = tmpdir / "different.pdf"
    different_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /UF(my-file.txt) /", attached_path.read_binary()))
    with PdfReader(str(different_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt", "my-file.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
        assert pdf._get_attachments("my-file.txt") == {"my-file.txt": b"Hello World\n"}

    # Only name: the /UF entry is dropped completely.
    no_f_path = tmpdir / "no-f.pdf"
    no_f_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /", attached_path.read_binary()))
    with PdfReader(str(no_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # UF and F: an additional /F entry with another name is inserted in front
    # of /UF; only the /UF name is expected to be reported.
    uf_f_path = tmpdir / "uf-f.pdf"
    uf_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b"/F(file.txt) /UF "))
    with PdfReader(str(uf_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # Only F: the /UF key is renamed to /F.
    only_f_path = tmpdir / "f.pdf"
    only_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b" /F "))
    with PdfReader(str(only_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}


def test_get_attachments__same_attachment_more_than_twice():
    """Five attachments sharing one name are returned as a list in insertion order."""
    attachment_name = "test.txt"
    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(100, 100)
    for i in range(5):
        pdf_writer.add_attachment(attachment_name, f"content{i}")

    expected_contents = [b"content0", b"content1", b"content2", b"content3", b"content4"]
    retrieved = pdf_writer._get_attachments(attachment_name)
    assert retrieved == {attachment_name: expected_contents}

0 comments on commit b94d203

Please sign in to comment.