Skip to content

Commit

Permalink
TST: writer.remove_text
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Jun 5, 2022
1 parent 34919f9 commit 6ed6727
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 4 deletions.
2 changes: 1 addition & 1 deletion PyPDF2/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1292,7 +1292,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
"""
Remove text from this output.
:param bool ignoreByteStringObject: optional parameter
:param bool ignore_byte_string_object: optional parameter
to ignore ByteString Objects.
"""
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
Expand Down
15 changes: 12 additions & 3 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,10 +1097,19 @@ def __init__(self, stream: Any, pdf: Any) -> None:
data = b_("")
for s in stream:
data += b_(s.get_object().get_data())
stream = BytesIO(b_(data))
stream_bytes = BytesIO(b_(data))
elif isinstance(stream, (EncodedStreamObject, DecodedStreamObject)):
stream_data = stream.get_data()
assert stream_data is not None
stream_data_bytes = b_(stream_data)
stream_bytes = BytesIO(stream_data_bytes)
elif isinstance(stream, DictionaryObject):
stream_bytes = BytesIO()
stream.write_to_stream(stream_bytes, None)
stream_bytes.seek(0)
else:
stream = BytesIO(b_(stream.get_data()))
self.__parseContentStream(stream)
stream_bytes = BytesIO(b_(stream.get_data()))
self.__parseContentStream(stream_bytes)

def __parseContentStream(self, stream: StreamType) -> None:
# file("f:\\tmp.txt", "w").write(stream.read())
Expand Down
70 changes: 70 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,76 @@ def test_remove_text(input_path, ignore_byte_string_object):
os.remove(tmp_filename)


@pytest.mark.parametrize(
    ("ignore_byte_string_object"),
    [False, True],
)
def test_remove_text_all_operators(ignore_byte_string_object):
    """Smoke-test ``PdfWriter.remove_text`` on a hand-built one-page PDF.

    The page's content stream shows text with the ``Tj``, ``"`` and ``'``
    operators. The PDF is assembled in memory, read back non-strictly,
    its page is inserted into a fresh writer, the text is removed and the
    result is written to a temporary file that is always cleaned up.

    :param bool ignore_byte_string_object: forwarded unchanged to
        ``PdfWriter.remove_text``.
    """
    stream = (
        b"BT "
        b"/F0 36 Tf "
        b"50 706 Td "
        b"36 TL "
        b"(The Tj operator) Tj "
        b'1 2 (The double quote operator) " '
        b"(The single quote operator) ' "
        b"ET"
    )

    # Header plus objects 1-6, formatted up front: offsets must be measured
    # on the final bytes, not on a template that still contains "%%" and
    # "%d" placeholders whose expansion shifts everything after them.
    body = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Length %d >>\n"
        b"stream\n%s\n"
        b"endstream\n"
        b"endobj\n"
        b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
    ) % (len(stream), stream)

    # One cross-reference entry per object, pointing at its "N 0 obj" header.
    object_offsets = [
        body.find(b"%d 0 obj" % number) for number in range(1, 7)
    ]
    xref_entries = b"".join(
        b"%010d 00000 n\n" % offset for offset in object_offsets
    )
    # The xref table starts right after the body, so its offset is len(body).
    xref_offset = len(body)

    pdf_data = (
        body
        + b"xref 1 6\n"
        + xref_entries
        + b"trailer << /Root 6 0 R /Size 6 >>\n"
        + b"startxref\n%d\n" % xref_offset
        + b"%%EOF"
    )

    reader = PdfReader(BytesIO(pdf_data), strict=False)
    writer = PdfWriter()

    writer.insert_page(reader.pages[0], 0)
    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)

    tmp_filename = "dont_commit_writer_removed_text.pdf"
    try:
        with open(tmp_filename, "wb") as output_stream:
            writer.write(output_stream)
        # The writer must have produced some output.
        assert os.path.getsize(tmp_filename) > 0
    finally:
        # Clean up even when writing fails, so no stray file is left behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)


def test_write_metadata():
pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")

Expand Down

0 comments on commit 6ed6727

Please sign in to comment.