From a41c4979bc5f18224711352390bbe1a68ad1392c Mon Sep 17 00:00:00 2001 From: Michael Karlen Date: Fri, 16 Sep 2022 13:31:54 +0200 Subject: [PATCH] Fix performance issues with large embedded base64 images Certain PDF libraries embed images as base64 strings. This causes performance issues in `read_string_from_stream` due to incremental, byte-by-byte string concatenation. The PDF library in our case is ``` Canon iR-ADV C256 PDF PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon ``` --- PyPDF2/generic/_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py index c5b6129f0..b7a0ee416 100644 --- a/PyPDF2/generic/_utils.py +++ b/PyPDF2/generic/_utils.py @@ -41,7 +41,7 @@ def read_string_from_stream( ) -> Union["TextStringObject", "ByteStringObject"]: tok = stream.read(1) parens = 1 - txt = b"" + txt = [] while True: tok = stream.read(1) if not tok: @@ -106,8 +106,8 @@ def read_string_from_stream( else: msg = rf"Unexpected escaped string: {tok.decode('utf8')}" logger_warning(msg, __name__) - txt += tok - return create_string_object(txt, forced_encoding) + txt.append(tok) + return create_string_object(b''.join(txt), forced_encoding) def create_string_object( @@ -164,7 +164,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str: raise UnicodeDecodeError( "pdfdocencoding", bytearray(b), - -1, + -1, -1, "does not exist in translation table", )