BUG: Convert color space before saving (#1802)

The PDF contained an image in PA mode:

* P: 8-bit pixels, mapped to any other mode using a color palette
* PA: P with alpha

See #1801
py-pdf · Apr 19, 2023 · bd140cb · bd140cb
1 parent f8295a9
commit bd140cb
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 2 deletions.
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -652,7 +652,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
         and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
     ):
         # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
-        mode: Literal["RGB", "P"] = "RGB"
+        mode: Literal["RGB", "P", "L", "RGBA"] = "RGB"
     else:
         mode = "P"
     extension = None
@@ -683,11 +683,29 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
                 else:
                     img.putpalette(lookup.get_data())
                 img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
+            elif color_space is not None and color_space[0] == "/ICCBased":
+                # see Table 66 - Additional Entries Specific to an ICC Profile
+                # Stream Dictionary
+                icc_profile = color_space[1].get_object()
+                color_components = cast(int, icc_profile["/N"])
+                alternate_colorspace = icc_profile["/Alternate"]
+                color_space = alternate_colorspace
+                mode_map = {
+                    "/DeviceGray": "L",
+                    "/DeviceRGB": "RGB",
+                    "/DeviceCMYK": "RGBA",
+                }
+                mode = (
+                    mode_map.get(color_space)  # type: ignore
+                    or {1: "L", 3: "RGB", 4: "RGBA"}.get(color_components)
+                    or mode
+                )  # type: ignore
+                img = Image.frombytes(mode, size, data)
             if G.S_MASK in x_object_obj:  # add alpha channel
                 alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())
                 img.putalpha(alpha)
             img_byte_arr = BytesIO()
-            img.save(img_byte_arr, format="PNG")
+            img.convert("RGBA").save(img_byte_arr, format="PNG")
             data = img_byte_arr.getvalue()
         elif x_object_obj[SA.FILTER] in (
             [FT.LZW_DECODE],

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -265,3 +265,28 @@ def test_issue_1737():
     reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data()
     reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data()
     reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data()
+
+
+@pytest.mark.enable_socket()
+def test_pa_image_extraction():
+    """
+    PNG images with PA mode can be extracted.
+
+    This is a regression test for issue #1801
+    """
+    url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf"
+    name = "issue-1801.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+
+    page0 = reader.pages[0]
+    images = page0.images
+    assert len(images) == 1
+    assert images[0].name == "Im1.png"
+
+    # Ensure visual appearance
+    data = get_pdf_from_url(
+        "https://user-images.githubusercontent.com/"
+        "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png",
+        "issue-1801.png",
+    )
+    assert data == images[0].data