From e3c24873e19eaa69be527052d146e12f640a6659 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Wed, 6 Jul 2022 22:41:31 +0200
Subject: [PATCH 1/5] BUG: Byte math errors for decoding bitmap PNGs

Closes #535
Closes #536

Co-authored-by: Christopher Egner <chris@science.clinic>
---
 PyPDF2/filters.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 077143813..8b78af297 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -108,36 +108,49 @@ def decode(
         if predictor != 1:
             # The /Columns param. has 1 as the default value; see ISO 32000,
             # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8
+            DEFAULT_BITS_PER_COMPONENT = 8
             if isinstance(decode_parms, ArrayObject):
                 columns = 1
+                bits_per_component = DEFAULT_BITS_PER_COMPONENT
                 for decode_parm in decode_parms:
                     if "/Columns" in decode_parm:
                         columns = decode_parm["/Columns"]
+                    if LZW.BITS_PER_COMPONENT in decode_parm:
+                        bits_per_component = decode_parm[LZW.BITS_PER_COMPONENT]
             else:
                 columns = (
                     1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1)
                 )
+                bits_per_component = decode_parms.get(
+                    LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT
+                )
+
+            # PNG predictor can vary by row and so is the lead byte on each row
+            rowlength = (
+                math.ceil(columns * bits_per_component / 8) + 1
+            )  # number of bytes
 
             # PNG prediction:
             if 10 <= predictor <= 15:
-                str_data = FlateDecode._decode_png_prediction(str_data, columns)  # type: ignore
+                str_data = FlateDecode._decode_png_prediction(str_data, columns, rowlength)  # type: ignore
             else:
                 # unsupported predictor
                 raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
         return str_data
 
     @staticmethod
-    def _decode_png_prediction(data: str, columns: int) -> str:
-        output = StringIO()
+    def _decode_png_prediction(data: str, columns: int, rowlength: int) -> bytes:
+        output = BytesIO()
         # PNG prediction can vary from row to row
-        rowlength = columns + 1
-        assert len(data) % rowlength == 0
+        if len(data) % rowlength != 0:
+            raise PdfReadError("Image data is not rectangular")
         prev_rowdata = (0,) * rowlength
         for row in range(len(data) // rowlength):
             rowdata = [
                 ord_(x) for x in data[(row * rowlength) : ((row + 1) * rowlength)]
             ]
             filter_byte = rowdata[0]
+
             if filter_byte == 0:
                 pass
             elif filter_byte == 1:
@@ -162,7 +175,7 @@ def _decode_png_prediction(data: str, columns: int) -> str:
                 # unsupported PNG filter
                 raise PdfReadError(f"Unsupported PNG filter {filter_byte!r}")
             prev_rowdata = tuple(rowdata)
-            output.write("".join([chr(x) for x in rowdata[1:]]))
+            output.write(bytearray(rowdata[1:]))
         return output.getvalue()
 
     @staticmethod

From 9bf83da6e47b3e534279c5ff18c3caf6e3c7092b Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Thu, 7 Jul 2022 06:32:34 +0200
Subject: [PATCH 2/5] StringIO is no longer needed

---
 PyPDF2/filters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 8b78af297..6dbde7093 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -37,7 +37,7 @@
 import math
 import struct
 import zlib
-from io import BytesIO, StringIO
+from io import BytesIO
 from typing import Any, Dict, Optional, Tuple, Union
 
 from .generic import ArrayObject, DictionaryObject, NameObject

From 5942dfa4aecea37285c2340067e31f5a3a21bd7f Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Thu, 7 Jul 2022 07:14:39 +0200
Subject: [PATCH 3/5] Handle decode_parms being None when reading BitsPerComponent

---
 PyPDF2/filters.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 6dbde7093..00111ccd9 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -121,8 +121,10 @@ def decode(
                 columns = (
                     1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1)
                 )
-                bits_per_component = decode_parms.get(
-                    LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT
+                bits_per_component = (
+                    decode_parms.get(LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT)
+                    if decode_parms
+                    else DEFAULT_BITS_PER_COMPONENT
                 )
 
             # PNG predictor can vary by row and so is the lead byte on each row

From 71ce1208074bfd1be03aaa031b8762338e813da0 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 9 Jul 2022 10:04:41 +0200
Subject: [PATCH 4/5] Tolerate missing CCITT and ColorSpace keys; add image extraction test

---
 .gitignore              |  1 +
 PyPDF2/filters.py       | 13 +++++++++----
 tests/test_reader.py    |  1 -
 tests/test_workflows.py | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index f4ec295c5..97f93ad19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,4 @@ PyPDF2_pdfLocation.txt
 .python-version
 tests/pdf_cache/
 docs/meta/CHANGELOG.md
+extracted-images/
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 00111ccd9..a5ad61ff9 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -425,7 +425,7 @@ def _get_parameters(
         parameters: Union[None, ArrayObject, DictionaryObject], rows: int
     ) -> CCITParameters:
         k = 0
-        columns = 0
+        columns = 1728  # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter
         if parameters:
             if isinstance(parameters, ArrayObject):
                 for decode_parm in parameters:
@@ -434,8 +434,10 @@ def _get_parameters(
                     if CCITT.K in decode_parm:
                         k = decode_parm[CCITT.K]
             else:
-                columns = parameters[CCITT.COLUMNS]  # type: ignore
-                k = parameters[CCITT.K]  # type: ignore
+                if CCITT.COLUMNS in parameters:
+                    columns = parameters[CCITT.COLUMNS]  # type: ignore
+                if CCITT.K in parameters:
+                    k = parameters[CCITT.K]  # type: ignore
 
         return CCITParameters(k, columns, rows)
 
@@ -556,7 +558,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
 
     size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
     data = x_object_obj.get_data()  # type: ignore
-    if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB:
+    if (
+        IA.COLOR_SPACE in x_object_obj
+        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
+    ):
         mode: Literal["RGB", "P"] = "RGB"
     else:
         mode = "P"
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 13c52c1ea..12378fdef 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -705,7 +705,6 @@ def test_read_not_binary_mode():
             PdfReader(f)
 
 
-@pytest.mark.xfail(reason="#416")
 def test_read_form_416():
     url = (
         "https://www.fda.gov/downloads/AboutFDA/ReportsManualsForms/Forms/UCM074728.pdf"
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index 28d52552c..601a13041 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -2,12 +2,16 @@
 import os
 import sys
 from io import BytesIO
+from pathlib import Path
 
 import pytest
 
 from PyPDF2 import PdfMerger, PdfReader, PdfWriter
+from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
+from PyPDF2.constants import Ressources as RES
 from PyPDF2.errors import PdfReadError, PdfReadWarning
+from PyPDF2.filters import _xobj_to_image
 
 from . import get_pdf_from_url
 
@@ -372,3 +376,34 @@ def test_merge_output():
 
     # Cleanup
     merger.close()
+
+
+def test_image_extraction():
+    """Extract the images of a downloaded PDF to disk and remove them again."""
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf"
+    name = "tika-994636.pdf"
+    data = BytesIO(get_pdf_from_url(url, name=name))
+    reader = PdfReader(data)
+
+    images_extracted = []
+    root = Path("extracted-images")
+    root.mkdir(exist_ok=True)
+
+    for page in reader.pages:
+        if RES.XOBJECT in page[PG.RESOURCES]:
+            x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()
+
+            for obj in x_object:
+                if x_object[obj][IA.SUBTYPE] == "/Image":
+                    extension, byte_stream = _xobj_to_image(x_object[obj])
+                    if extension is not None:
+                        filename = root / (obj[1:] + extension)
+                        filename.write_bytes(byte_stream)
+                        images_extracted.append(filename)
+
+    # Cleanup
+    do_cleanup = True  # set this to False for manual inspection
+    if do_cleanup:
+        for filepath in images_extracted:
+            if os.path.exists(filepath):
+                os.remove(filepath)

From 408bff58955d191e5e22b89452a94b7da2f7b414 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 9 Jul 2022 11:20:55 +0200
Subject: [PATCH 5/5] Add Indexed color space support

---
 PyPDF2/filters.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 7aaca877b..543844454 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -570,7 +570,21 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
     if SA.FILTER in x_object_obj:
         if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
             extension = ".png"
+            color_space = None
+            if "/ColorSpace" in x_object_obj:
+                color_space = x_object_obj["/ColorSpace"].get_object()
+                if (
+                    isinstance(color_space, ArrayObject)
+                    and color_space[0] == "/Indexed"
+                ):
+                    color_space, base, hival, lookup = (
+                        value.get_object() for value in color_space
+                    )
+
             img = Image.frombytes(mode, size, data)
+            if color_space == "/Indexed":
+                img.putpalette(lookup.get_data())
+                img = img.convert("RGB")
             if G.S_MASK in x_object_obj:  # add alpha channel
                 alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data())
                 img.putalpha(alpha)