Skip to content

Commit

Permalink
BUG: Convert color space before saving (#1802)
Browse files Browse the repository at this point in the history
The PDF contained an image in PA mode:
* P: 8-bit pixels, mapped to any other mode using a color palette
* PA: P with alpha

See #1801
  • Loading branch information
MartinThoma authored Apr 19, 2023
1 parent f8295a9 commit bd140cb
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
22 changes: 20 additions & 2 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
):
# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
mode: Literal["RGB", "P"] = "RGB"
mode: Literal["RGB", "P", "L", "RGBA"] = "RGB"
else:
mode = "P"
extension = None
Expand Down Expand Up @@ -683,11 +683,29 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
else:
img.putpalette(lookup.get_data())
img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
elif color_space is not None and color_space[0] == "/ICCBased":
# see Table 66 - Additional Entries Specific to an ICC Profile
# Stream Dictionary
icc_profile = color_space[1].get_object()
color_components = cast(int, icc_profile["/N"])
alternate_colorspace = icc_profile["/Alternate"]
color_space = alternate_colorspace
mode_map = {
"/DeviceGray": "L",
"/DeviceRGB": "RGB",
"/DeviceCMYK": "RGBA",
}
mode = (
mode_map.get(color_space) # type: ignore
or {1: "L", 3: "RGB", 4: "RGBA"}.get(color_components)
or mode
) # type: ignore
img = Image.frombytes(mode, size, data)
if G.S_MASK in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())
img.putalpha(alpha)
img_byte_arr = BytesIO()
img.save(img_byte_arr, format="PNG")
img.convert("RGBA").save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()
elif x_object_obj[SA.FILTER] in (
[FT.LZW_DECODE],
Expand Down
25 changes: 25 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,28 @@ def test_issue_1737():
reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data()
reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data()
reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data()


@pytest.mark.enable_socket()
def test_pa_image_extraction():
    """
    Check that an image stored in PA mode is extracted as a PNG.

    Regression test for issue #1801.
    """
    pdf_bytes = get_pdf_from_url(
        "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf",
        name="issue-1801.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    extracted = first_page.images
    assert len(extracted) == 1
    assert extracted[0].name == "Im1.png"

    # Ensure visual appearance: the extracted bytes must equal the
    # reference PNG downloaded below.
    reference_png = get_pdf_from_url(
        "https://user-images.githubusercontent.com/"
        "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png",
        "issue-1801.png",
    )
    assert reference_png == extracted[0].data

0 comments on commit bd140cb

Please sign in to comment.