Skip to content

Commit

Permalink
ROB: cope with some image extraction issues
Browse files Browse the repository at this point in the history
closes #2343:
1st case: image data encoded with 1 byte per pixel in a Separation colorspace

2nd case: similar, plus a trailing \n at the end of the image data that must be ignored
  • Loading branch information
pubpub-zz committed Apr 8, 2024
1 parent 5c6a7b6 commit bd88388
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 6 deletions.
16 changes: 15 additions & 1 deletion pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,20 @@ def _get_imagemode(
return mode, mode == "CMYK"


def _extended_image_frombytes(
    mode: str, size: Tuple[int, int], data: bytes
) -> Any:  # Any = ImageType
    """
    Build an image from raw bytes, expanding under-sized data when possible.

    ``Image.frombytes`` is tried first with the data as provided. If it
    raises ``ValueError`` and the data length is a non-zero multiple of the
    pixel count, every byte is repeated so that the buffer reaches
    ``width * height * len(mode)`` bytes, and the construction is retried.

    Args:
        mode: Image mode passed to ``Image.frombytes``; its length is used
            as the number of bytes expected per pixel.
        size: ``(width, height)`` of the image in pixels.
        data: Raw image data.

    Returns:
        The image returned by ``Image.frombytes``.

    Raises:
        ValueError: If the data cannot be decoded as provided and cannot be
            expanded (empty data, zero-sized image, or a length that is not
            a multiple of the pixel count), or if the retry fails as well.
    """
    try:
        img = Image.frombytes(mode, size, data)
    except ValueError:
        nb_pix = size[0] * size[1]
        data_length = len(data)
        # Empty data or a zero-sized image cannot be expanded: surface the
        # original decoding error instead of a ZeroDivisionError from the
        # modulo/division below.
        if nb_pix == 0 or data_length == 0 or data_length % nb_pix != 0:
            raise
        # Number of times each input byte must be repeated to fill all bands.
        repeat = nb_pix * len(mode) // data_length
        data = b"".join(bytes((x,) * repeat) for x in data)
        img = Image.frombytes(mode, size, data)
    return img


def _handle_flate(
size: Tuple[int, int],
data: bytes,
Expand Down Expand Up @@ -168,7 +182,7 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
elif mode == "4bits":
mode = "P"
data = bits2byte(data, size, 4)
img = Image.frombytes(mode, size, data)
img = _extended_image_frombytes(mode, size, data)
if color_space == "/Indexed":
from .generic import TextStringObject

Expand Down
22 changes: 17 additions & 5 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ def decode(
index = 0
while True:
if index >= len(data):
logger_warning("missing EOD in ASCIIHexDecode, check if output is OK", __name__)
logger_warning(
"missing EOD in ASCIIHexDecode, check if output is OK", __name__
)
break # reach End Of String even if no EOD
char = data[index : index + 1]
if char == b">":
Expand Down Expand Up @@ -341,7 +343,9 @@ def decode(
index = 0
while True:
if index >= len(data):
logger_warning("missing EOD in RunLengthDecode, check if output is OK", __name__)
logger_warning(
"missing EOD in RunLengthDecode, check if output is OK", __name__
)
break # reach End Of String even if no EOD
length = data[index]
index += 1
Expand Down Expand Up @@ -733,6 +737,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
"""
from ._xobj_image_helpers import (
Image,
_extended_image_frombytes,
_get_imagemode,
_handle_flate,
_handle_jpx,
Expand All @@ -747,10 +752,12 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
else:
obj_as_text = x_object_obj.__repr__()

size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
data = x_object_obj.get_data() # type: ignore
if isinstance(data, str): # pragma: no cover
data = data.encode()
if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. '\n'
data = data[:-1]
colors = x_object_obj.get("/Colors", 1)
color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
if isinstance(color_space, list) and len(color_space) == 1:
Expand Down Expand Up @@ -819,7 +826,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
)
elif mode == "CMYK":
img, image_format, extension, invert_color = (
Image.frombytes(mode, size, data),
_extended_image_frombytes(mode, size, data),
"TIFF",
".tif",
False,
Expand All @@ -828,7 +835,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
else:
img, image_format, extension, invert_color = (
Image.frombytes(mode, size, data),
_extended_image_frombytes(mode, size, data),
"PNG",
".png",
False,
Expand All @@ -849,6 +856,11 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
and color_space[0].get_object() == "/Indexed"
):
decode = None # decode is meanless of Indexed
if (
isinstance(color_space, ArrayObject)
and color_space[0].get_object() == "/Separation"
):
decode = [1.0, 0.0] * len(img.getbands())
if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
lut: List[int] = []
for i in range(0, len(decode), 2):
Expand Down
32 changes: 32 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,35 @@ def test_cmyk_no_filter():
name = "iss2522.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
reader.pages[0].images[0].image


@pytest.mark.enable_socket()
def test_separation_1byte_to_rgb_inverted():
    """
    Cf #2343

    Extract the first image of the first page of the sample PDF and compare
    it with a reference PNG.
    """
    url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf"
    name = "iss2343.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Use the permanent asset URL: the previous "private-user-images" link
    # carried a signed token that expired a few minutes after it was issued,
    # so the reference image could not be fetched without a local cache.
    url = "https://github.com/py-pdf/pypdf/assets/4083478/b7f41897-96ef-4ea6-b165-5ef307a92b87"
    name = "iss2343.png"
    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
    assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99


@pytest.mark.enable_socket()
def test_data_with_lf():
    """
    Cf #2343

    Extract image index 9 of page index 8 of the sample PDF and compare it
    with a reference PNG.
    """
    url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
    name = "iss2343b.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Use the permanent asset URL: the previous "private-user-images" link
    # carried a signed token that expired a few minutes after it was issued,
    # so the reference image could not be fetched without a local cache.
    url = "https://github.com/py-pdf/pypdf/assets/4083478/1120b0cf-a67a-403f-aa1a-9a191cbc087f"
    name = "iss2343b0.png"
    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
    assert image_similarity(reader.pages[8].images[9].image, img) >= 0.99

0 comments on commit bd88388

Please sign in to comment.