From 2b1e6d9cf22e78bba08047187c2144022e4f625e Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 5 Jun 2022 11:37:17 +0200 Subject: [PATCH] TST: writer.remove_text --- PyPDF2/_writer.py | 2 +- PyPDF2/generic.py | 9 ++++-- tests/test_writer.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 26fe8566b..95648d9fa 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -1292,7 +1292,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: """ Remove text from this output. - :param bool ignoreByteStringObject: optional parameter + :param bool ignore_byte_string_object: optional parameter to ignore ByteString Objects. """ pg_dict = cast(DictionaryObject, self.get_object(self._pages)) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index f676ac230..6cdccc60f 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -1097,10 +1097,13 @@ def __init__(self, stream: Any, pdf: Any) -> None: data = b_("") for s in stream: data += b_(s.get_object().get_data()) - stream = BytesIO(b_(data)) + stream_bytes = BytesIO(b_(data)) else: - stream = BytesIO(b_(stream.get_data())) - self.__parseContentStream(stream) + stream_data = stream.get_data() + assert stream_data is not None + stream_data_bytes = b_(stream_data) + stream_bytes = BytesIO(stream_data_bytes) + self.__parseContentStream(stream_bytes) def __parseContentStream(self, stream: StreamType) -> None: # file("f:\\tmp.txt", "w").write(stream.read()) diff --git a/tests/test_writer.py b/tests/test_writer.py index 4a581645e..5eb841a8d 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -132,6 +132,76 @@ def test_remove_text(input_path, ignore_byte_string_object): os.remove(tmp_filename) +@pytest.mark.parametrize( + ("ignore_byte_string_object"), + [False, True], +) +def test_remove_text_all_operators(ignore_byte_string_object): + stream = ( + b"BT " + b"/F0 36 Tf " + b"50 706 Td " + 
b"36 TL " + b"(The Tj operator) Tj " + b'1 2 (The double quote operator) " ' + b"(The single quote operator) ' " + b"ET" + ) + pdf_data = ( + b"%%PDF-1.7\n" + b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" + b"2 0 obj << >> endobj\n" + b"3 0 obj << >> endobj\n" + b"4 0 obj << /Length %d >>\n" + b"stream\n" + (b"%s\n" % stream) + b"endstream\n" + b"endobj\n" + b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" + b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" + b" /Resources << /Font << >> >>" + b" /Rotate 0 /Type /Page >> endobj\n" + b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" + b"xref 1 6\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"trailer << /Root 6 0 R /Size 6 >>\n" + b"startxref\n%d\n" + b"%%%%EOF" + ) + startx_correction = -1 + pdf_data = pdf_data % ( + len(stream), + pdf_data.find(b"1 0 obj") + startx_correction, + pdf_data.find(b"2 0 obj") + startx_correction, + pdf_data.find(b"3 0 obj") + startx_correction, + pdf_data.find(b"4 0 obj") + startx_correction, + pdf_data.find(b"5 0 obj") + startx_correction, + pdf_data.find(b"6 0 obj") + startx_correction, + # startx_correction should be -1 because the double % at the beginning induces an error in the startxref computation + pdf_data.find(b"xref"), + ) + print(pdf_data.decode()) + pdf_stream = BytesIO(pdf_data) + + reader = PdfReader(pdf_stream, strict=False) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_text(ignore_byte_string_object=ignore_byte_string_object) + + # finally, write the output to dont_commit_writer_removed_text.pdf + tmp_filename = "dont_commit_writer_removed_text.pdf" + with open(tmp_filename, "wb") as output_stream: + writer.write(output_stream) + + # Cleanup + os.remove(tmp_filename) + + def test_write_metadata(): pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")