PI: Avoid string concatenation with large embedded base64-encoded images (#1350)

Certain PDF libraries embed images as base64 strings. This causes performance issues in `read_string_from_stream`, because the result is built by incremental string concatenation, one byte at a time. An example of such a library is `Canon iR-ADV C256 PDF` (PDF Annotator 8.0.0.826 - Adobe PSL 1.3e for Canon).

Co-authored-by: Michael Karlen <[email protected]>
py-pdf · Sep 17, 2022 · 3be01fd · 3be01fd
1 parent 7c96d13
commit 3be01fd
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py
@@ -41,7 +41,7 @@ def read_string_from_stream(
 ) -> Union["TextStringObject", "ByteStringObject"]:
     tok = stream.read(1)
     parens = 1
-    txt = b""
+    txt = []
     while True:
         tok = stream.read(1)
         if not tok:
@@ -106,8 +106,8 @@ def read_string_from_stream(
                 else:
                     msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
                     logger_warning(msg, __name__)
-        txt += tok
-    return create_string_object(txt, forced_encoding)
+        txt.append(tok)
+    return create_string_object(b''.join(txt), forced_encoding)
 
 
 def create_string_object(