py-pdf · MartinThoma · Jun 5, 2022 · Jun 5, 2022
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py
@@ -1292,7 +1292,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
         """
         Remove text from this output.
 
-        :param bool ignoreByteStringObject: optional parameter
+        :param bool ignore_byte_string_object: optional parameter
             to ignore ByteString Objects.
         """
         pg_dict = cast(DictionaryObject, self.get_object(self._pages))

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -1097,10 +1097,13 @@ def __init__(self, stream: Any, pdf: Any) -> None:
             data = b_("")
             for s in stream:
                 data += b_(s.get_object().get_data())
-            stream = BytesIO(b_(data))
+            stream_bytes = BytesIO(b_(data))
         else:
-            stream = BytesIO(b_(stream.get_data()))
-        self.__parseContentStream(stream)
+            stream_data = stream.get_data()
+            assert stream_data is not None
+            stream_data_bytes = b_(stream_data)
+            stream_bytes = BytesIO(stream_data_bytes)
+        self.__parseContentStream(stream_bytes)
 
     def __parseContentStream(self, stream: StreamType) -> None:
         # file("f:\\tmp.txt", "w").write(stream.read())

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -132,6 +132,76 @@ def test_remove_text(input_path, ignore_byte_string_object):
     os.remove(tmp_filename)
 
 
+@pytest.mark.parametrize(
+    ("ignore_byte_string_object"),
+    [False, True],
+)
+def test_remove_text_all_operators(ignore_byte_string_object):
+    stream = (
+        b"BT "
+        b"/F0 36 Tf "
+        b"50 706 Td "
+        b"36 TL "
+        b"(The Tj operator) Tj "
+        b'1 2 (The double quote operator) " '
+        b"(The single quote operator) ' "
+        b"ET"
+    )
+    pdf_data = (
+        b"%%PDF-1.7\n"
+        b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Length %d >>\n"
+        b"stream\n" + (b"%s\n" % stream) + b"endstream\n"
+        b"endobj\n"
+        b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 6\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << /Root 6 0 R /Size 6 >>\n"
+        b"startxref\n%d\n"
+        b"%%%%EOF"
+    )
+    startx_correction = -1
+    pdf_data = pdf_data % (
+        len(stream),
+        pdf_data.find(b"1 0 obj") + startx_correction,
+        pdf_data.find(b"2 0 obj") + startx_correction,
+        pdf_data.find(b"3 0 obj") + startx_correction,
+        pdf_data.find(b"4 0 obj") + startx_correction,
+        pdf_data.find(b"5 0 obj") + startx_correction,
+        pdf_data.find(b"6 0 obj") + startx_correction,
+        # startx_correction should be -1: the double % at the beginning collapses to a single % after formatting, inducing an off-by-one error in the offsets computed above
+        pdf_data.find(b"xref"),
+    )
+    print(pdf_data.decode())
+    pdf_stream = BytesIO(pdf_data)
+
+    reader = PdfReader(pdf_stream, strict=False)
+    writer = PdfWriter()
+
+    page = reader.pages[0]
+    writer.insert_page(page, 0)
+    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)
+
+    # finally, write the output to a temporary file
+    tmp_filename = "dont_commit_writer_removed_text.pdf"
+    with open(tmp_filename, "wb") as output_stream:
+        writer.write(output_stream)
+
+    # Cleanup
+    os.remove(tmp_filename)
+
+
 def test_write_metadata():
     pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")