ENH: Support alternative (U)F names for embedded file retrieval (#3072)

Closes #3070.
py-pdf · Jan 26, 2025 · b94d203 · b94d203
1 parent ad97deb
commit b94d203
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 18 deletions.
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -70,7 +70,7 @@ jobs:
         sudo apt-get update
     - name: Install APT dependencies
       run:
-        sudo apt-get install ghostscript
+        sudo apt-get install ghostscript poppler-utils
     - name: Checkout Code
       uses: actions/checkout@v4
       with:

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -1350,7 +1350,8 @@ def _list_attachments(self) -> List[str]:
         catalog = self.root_object
         # From the catalog get the embedded file names
         try:
-            filenames = cast(
+            # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
+            names = cast(
                 ArrayObject,
                 cast(
                     DictionaryObject,
@@ -1359,8 +1360,23 @@ def _list_attachments(self) -> List[str]:
             )
         except KeyError:
             return []
-        attachments_names = [f for f in filenames if isinstance(f, str)]
-        return attachments_names
+        attachment_names: List[str] = []
+        for i, name in enumerate(names):
+            if isinstance(name, str):
+                attachment_names.append(name)
+                continue
+            file_dictionary = name.get_object()  # File specification belonging to names[i - 1].
+            for key in ["/UF", "/F"]:
+                # PDF 2.0 reference, table 43:
+                #   > A PDF reader shall use the value of the UF key, when present, instead of the F key.
+                if key in file_dictionary:
+                    alternative_name = file_dictionary[key].get_object()
+                    if alternative_name != names[i - 1]:
+                        # Avoid duplicates for the same entry.
+                        attachment_names.append(alternative_name)
+                    # The first key present wins; only fall back to /F when /UF is absent.
+                    break
+        return attachment_names
 
     def _get_attachment_list(self, name: str) -> List[bytes]:
         out = self._get_attachments(name)[name]
@@ -1389,7 +1405,8 @@ def _get_attachments(
         catalog = self.root_object
         # From the catalog get the embedded file names
         try:
-            filenames = cast(
+            # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
+            names = cast(
                 ArrayObject,
                 cast(
                     DictionaryObject,
@@ -1399,21 +1416,36 @@ def _get_attachments(
         except KeyError:
             return {}
         attachments: Dict[str, Union[bytes, List[bytes]]] = {}
+
         # Loop through attachments
-        for i in range(len(filenames)):
-            f = filenames[i]
-            if isinstance(f, str):
-                if filename is not None and f != filename:
-                    continue
-                name = f
-                f_dict = filenames[i + 1].get_object()
-                f_data = f_dict["/EF"]["/F"].get_data()
-                if name in attachments:
-                    if not isinstance(attachments[name], list):
-                        attachments[name] = [attachments[name]]  # type:ignore
-                    attachments[name].append(f_data)  # type:ignore
+        for i, name in enumerate(names):
+            if isinstance(name, str):
+                # Retrieve the corresponding reference.
+                file_dictionary = names[i + 1].get_object()
+            else:
+                # We have the reference, but need to determine the name.
+                file_dictionary = name.get_object()
+                for key in ["/UF", "/F"]:
+                    # PDF 2.0 reference, table 43:
+                    #   > A PDF reader shall use the value of the UF key, when present, instead of the F key.
+                    if key in file_dictionary:
+                        name = file_dictionary[key].get_object()
+                        break
                 else:
-                    attachments[name] = f_data
+                    continue
+                if name == names[i - 1]:
+                    # Avoid extracting the same file twice.
+                    continue
+
+            if filename is not None and name != filename:
+                continue
+            file_data = file_dictionary["/EF"]["/F"].get_data()
+            if name in attachments:
+                if not isinstance(attachments[name], list):
+                    attachments[name] = [attachments[name]]  # type:ignore
+                attachments[name].append(file_data)  # type:ignore
+            else:
+                attachments[name] = file_data
         return attachments
 
     @abstractmethod

diff --git a/tests/test_doc_common.py b/tests/test_doc_common.py
@@ -0,0 +1,71 @@
+"""Test the pypdf._doc_common module."""
+import re
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from pypdf import PdfReader, PdfWriter
+
+# Directory containing this test module.
+TESTS_ROOT = Path(__file__).parent.resolve()
+# Parent of the tests directory.
+PROJECT_ROOT = TESTS_ROOT.parent
+# Location of the sample files used as test inputs.
+SAMPLE_ROOT = PROJECT_ROOT.joinpath("sample-files")
+
+# Full path of the ``pdfattach`` executable (poppler-utils), or ``None`` when it is not on PATH.
+PDFATTACH_BINARY = shutil.which("pdfattach")
+
+
+@pytest.mark.skipif(PDFATTACH_BINARY is None, reason="Requires poppler-utils")
+def test_attachments(tmpdir):
+    """
+    Check attachment listing and retrieval for the different /UF and /F combinations.
+
+    A file is attached to a sample PDF with ``pdfattach``; the resulting bytes are
+    then patched to produce variants of the file specification (different /UF,
+    no /UF, both /UF and /F, only /F).
+    """
+    # No attachments.
+    clean_path = SAMPLE_ROOT / "002-trivial-libre-office-writer" / "002-trivial-libre-office-writer.pdf"
+    with PdfReader(clean_path) as pdf:
+        assert pdf._list_attachments() == []
+
+    # UF = name.
+    attached_path = tmpdir / "attached.pdf"
+    file_path = tmpdir / "test.txt"
+    file_path.write_binary(b"Hello World\n")
+    # check=True: fail right here if pdfattach errors, instead of later with an unrelated read error.
+    subprocess.run([PDFATTACH_BINARY, clean_path, file_path, attached_path], check=True)  # noqa: S603
+    with PdfReader(str(attached_path)) as pdf:
+        assert pdf._list_attachments() == ["test.txt"]
+        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
+
+    # UF != name.
+    different_path = tmpdir / "different.pdf"
+    different_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /UF(my-file.txt) /", attached_path.read_binary()))
+    with PdfReader(str(different_path)) as pdf:
+        assert pdf._list_attachments() == ["test.txt", "my-file.txt"]
+        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
+        assert pdf._get_attachments("my-file.txt") == {"my-file.txt": b"Hello World\n"}
+
+    # Only name.
+    no_f_path = tmpdir / "no-f.pdf"
+    no_f_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /", attached_path.read_binary()))
+    with PdfReader(str(no_f_path)) as pdf:
+        assert pdf._list_attachments() == ["test.txt"]
+        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
+
+    # UF and F.
+    uf_f_path = tmpdir / "uf-f.pdf"
+    uf_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b"/F(file.txt) /UF "))
+    with PdfReader(str(uf_f_path)) as pdf:
+        assert pdf._list_attachments() == ["test.txt"]
+        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
+
+    # Only F.
+    only_f_path = tmpdir / "f.pdf"
+    only_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b" /F "))
+    with PdfReader(str(only_f_path)) as pdf:
+        assert pdf._list_attachments() == ["test.txt"]
+        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
+
+
+def test_get_attachments__same_attachment_more_than_twice():
+    """Five attachments sharing one name are returned as a single list, in the order they were added."""
+    attachment_name = "test.txt"
+    pdf_writer = PdfWriter()
+    pdf_writer.add_blank_page(100, 100)
+    for index in range(5):
+        pdf_writer.add_attachment(attachment_name, f"content{index}")
+
+    expected_data = [f"content{index}".encode() for index in range(5)]
+    assert pdf_writer._get_attachments(attachment_name) == {attachment_name: expected_data}