Fix performance issues with large embedded base64 images

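Certain PDF libraries embed images as base64 strings. This causes performance issues in `read_string_from_stream`, which builds its result by incremental concatenation, one byte at a time. Because `bytes` objects are immutable, each `+=` can copy the entire buffer accumulated so far, so the total cost grows quadratically with the length of the string; collecting the pieces in a list and joining them once at the end is linear. The PDF producer in our case is: ``` <xmp:CreatorTool>Canon iR-ADV C256 PDF</xmp:CreatorTool> <pdf:Producer>PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon</pdf:Producer> ```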
py-pdf · Sep 16, 2022 · a41c497 · a41c497
1 parent 7c96d13
commit a41c497
Showing 1 changed file with 4 additions and 4 deletions.
diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py
@@ -41,7 +41,7 @@ def read_string_from_stream(
 ) -> Union["TextStringObject", "ByteStringObject"]:
     tok = stream.read(1)
     parens = 1
-    txt = b""
+    txt = []
     while True:
         tok = stream.read(1)
         if not tok:
@@ -106,8 +106,8 @@ def read_string_from_stream(
                 else:
                     msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
                     logger_warning(msg, __name__)
-        txt += tok
-    return create_string_object(txt, forced_encoding)
+        txt.append(tok)
+    return create_string_object(b''.join(txt), forced_encoding)
 
 
 def create_string_object(
@@ -164,7 +164,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str:
             raise UnicodeDecodeError(
                 "pdfdocencoding",
                 bytearray(b),
-                -1,
+                    -1,
                 -1,
                 "does not exist in translation table",
             )