py-pdf · MartinThoma · Sep 25, 2022 · Sep 25, 2022 · Sep 25, 2022 · Sep 25, 2022
diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py
@@ -416,7 +416,37 @@ def rename_kwargs(  # type: ignore
             )
 
 
+def _human_readable_bytes(bytes: int) -> str:
+    """
+    Format a byte count as a short, human-readable size string.
+
+    Decimal (power-of-1000) units are used: Byte, kB, MB and GB. GB is the
+    largest unit, so bigger values are still reported in GB.
+
+    :param bytes: Number of bytes.
+    :return: The raw count followed by "Byte" below 1000; otherwise the size
+        with one decimal place and its unit, e.g. "1.2 kB".
+    """
+    if bytes < 10**3:
+        return f"{bytes} Byte"
+    for exponent, unit in ((3, "kB"), (6, "MB")):
+        size = bytes / 10**exponent
+        # Decide on the rounded value: 999_999 bytes would otherwise be shown
+        # as "1000.0 kB" instead of "1.0 MB".
+        if round(size, 1) < 10**3:
+            return f"{size:.1f} {unit}"
+    return f"{bytes / 10**9:.1f} GB"
+
+
 @dataclass
 class File:
     name: str
     data: bytes
+
+    def __str__(self) -> str:
+        """Return the file name together with the human-readable data size."""
+        return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
+
+    def __repr__(self) -> str:
+        """Return the ``str()`` fields plus ``hash(self.data)``."""
+        return f"File(name={self.name}, data: {_human_readable_bytes(len(self.data))}, hash: {hash(self.data)})"
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -593,7 +593,12 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
 
             img = Image.frombytes(mode, size, data)
             if color_space == "/Indexed":
-                img.putpalette(lookup.get_data())
+                from .generic import ByteStringObject
+
+                if isinstance(lookup, ByteStringObject):
+                    img.putpalette(lookup)
+                else:
+                    img.putpalette(lookup.get_data())
                 img = img.convert("RGB")
             if G.S_MASK in x_object_obj:  # add alpha channel
                 alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -7,7 +7,9 @@
 import PyPDF2._utils
 from PyPDF2 import PdfReader
 from PyPDF2._utils import (
+    File,
     _get_max_pdf_version_header,
+    _human_readable_bytes,
     deprecate_bookmark,
     mark_location,
     matrix_multiply,
@@ -256,3 +258,24 @@ def test_escapedcode_followed_by_int():
     reader = PdfReader(io.BytesIO(get_pdf_from_url(url, name=name)))
     for page in reader.pages:
         page.extract_text()
+
+
+@pytest.mark.parametrize(
+    ("input_int", "expected_output"),
+    [
+        (123, "123 Byte"),  # below 10**3: raw count, no decimals
+        (1234, "1.2 kB"),  # units are decimal (powers of 1000)
+        (123_456, "123.5 kB"),
+        (1_234_567, "1.2 MB"),
+        (1_234_567_890, "1.2 GB"),
+        (1_234_567_890_000, "1234.6 GB"),  # GB is the largest unit; no TB step
+    ],
+)
+def test_human_readable_bytes(input_int, expected_output):
+    assert _human_readable_bytes(input_int) == expected_output
+
+
+def test_file():
+    f = File(name="image.png", data=b"")  # empty payload -> "0 Byte"
+    assert str(f) == "File(name=image.png, data: 0 Byte)"
+    assert repr(f) == f"File(name=image.png, data: 0 Byte, hash: {hash(b'')})"
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -636,6 +636,7 @@ def test_merge_output(caplog):
             "https://corpora.tika.apache.org/base/docs/govdocs1/969/969502.pdf",
             "tika-969502.pdf",
         ),
+        ("https://arxiv.org/pdf/2201.00214.pdf", "arxiv-2201.00214.pdf"),
     ],
 )
 def test_image_extraction(url, name):