Skip to content

Commit

Permalink
ENH: Add parameter to select images to be removed
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Sep 25, 2023
1 parent 06cc877 commit 2e89609
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 37 deletions.
2 changes: 2 additions & 0 deletions pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
from .constants import ImageType
from .pagerange import PageRange, parse_filename_page_ranges
from .papersizes import PaperSize

Expand All @@ -31,6 +32,7 @@
__all__ = [
"__version__",
"_debug_versions",
"ImageType",
"PageRange",
"PaperSize",
"DocumentInformation",
Expand Down
107 changes: 71 additions & 36 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
FieldFlag,
FileSpecificationDictionaryEntries,
GoToActionArguments,
ImageType,
InteractiveFormDictEntries,
PageLabelStyle,
TypFitArguments,
Expand Down Expand Up @@ -132,12 +133,16 @@


class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags selecting which kinds of objects are removed from a page.

    Flags can be combined with ``|``. ``IMAGES`` is a composite of the
    three image-related flags, not an independent bit.
    """

    NONE = 0
    TEXT = enum.auto()
    LINKS = enum.auto()
    ATTACHMENTS = enum.auto()
    OBJECTS_3D = enum.auto()
    ALL_ANNOTATIONS = enum.auto()
    XOBJECT_IMAGES = enum.auto()
    INLINE_IMAGES = enum.auto()
    DRAWING_IMAGES = enum.auto()
    # Defined once, as the union of the image flags: a second, standalone
    # ``IMAGES = enum.auto()`` member would be a duplicate enum key.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES


def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
Expand Down Expand Up @@ -2185,33 +2190,42 @@ def remove_objects_from_page(
if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
return self._remove_annots_from_page(page, None)

if to_delete & ObjectDeletionFlag.IMAGES:
jump_operators = []
if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
jump_operators = (
[b"w", b"J", b"j", b"M", b"d", b"i"]
+ [b"W", b"W*"]
+ [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
+ [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
+ [b"sh"]
)
else: # del text
if to_delete & ObjectDeletionFlag.TEXT:
jump_operators = [b"Tj", b"TJ", b"'", b'"']

def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
nonlocal to_delete
nonlocal jump_operators, to_delete
i = 0
while i < len(content.operations):
operands, operator = content.operations[i]
if operator in jump_operators:
if (
(
operator == b"INLINE IMAGE"
and (
cast(ObjectDeletionFlag, to_delete)
& ObjectDeletionFlag.INLINE_IMAGES
)
)
or (operator in jump_operators)
or (
operator == b"Do"
and (
cast(ObjectDeletionFlag, to_delete)
& ObjectDeletionFlag.XOBJECT_IMAGES
)
and (operands[0] in images)
)
):
del content.operations[i]
elif operator == b"Do":
if (
cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.IMAGES
and operands[0] in images
or cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.TEXT
and operands[0] in forms
):
del content.operations[i]
i += 1
else:
i += 1
content.get_data() # this ensures ._data is rebuilt from the .operations
Expand All @@ -2234,23 +2248,25 @@ def clean_forms(
try:
content: Any = None
if (
cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.IMAGES
cast(ObjectDeletionFlag, to_delete)
& ObjectDeletionFlag.XOBJECT_IMAGES
and o["/Subtype"] == "/Image"
):
content = NullObject()
content = NullObject() # to delete the image keeping the entry
images.append(k)
if o["/Subtype"] == "/Form":
forms.append(k)
if isinstance(o, ContentStream):
content = o
else:
content = ContentStream(o, self)
content.update(o.items())
for k1 in ["/Length", "/Filter", "/DecodeParms"]:
try:
del content[k1]
except KeyError:
pass
content.update(
{
k1: v1
for k1, v1 in o.items()
if k1 not in ["/Length", "/Filter", "/DecodeParms"]
}
)
clean_forms(content, stack + [elt]) # clean sub forms
if content is not None:
if isinstance(v, IndirectObject):
Expand All @@ -2261,6 +2277,8 @@ def clean_forms(
d[k] = self._add_object(content) # pragma: no cover
except (TypeError, KeyError):
pass
for im in images:
del d[im] # for clean-up
if isinstance(elt, StreamObject): # for /Form
if not isinstance(elt, ContentStream): # pragma: no cover
e = ContentStream(elt, self)
Expand All @@ -2269,40 +2287,57 @@ def clean_forms(
clean(elt, images, forms) # clean the content
return images, forms

if not isinstance(page, PageObject):
page = PageObject(self, page.indirect_reference)
if "/Contents" in page:
content = page["/Contents"].get_object()
content = cast(ContentStream, page.get_contents())

if not isinstance(content, ContentStream):
content = ContentStream(content, page)
images, forms = clean_forms(page, [])

clean(cast(ContentStream, content), images, forms)
if isinstance(page["/Contents"], ArrayObject):
for o in cast(ArrayObject, page["/Contents"]):
self._objects[o.idnum - 1] = NullObject()
try:
self._objects[
cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
] = NullObject()
except AttributeError:
pass
page[NameObject("/Contents")] = self._add_object(content)
page.replace_contents(content)

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
    ignore_byte_string_object: Optional[bool] = None,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The kinds of images to remove, given as a combination
            of :class:`ImageType` flags. Defaults to ``ImageType.ALL``.
        ignore_byte_string_object: deprecated
    """
    # Older callers passed ``ignore_byte_string_object`` as the first
    # positional argument, so a bool arriving here is that legacy flag.
    if isinstance(to_delete, bool):
        ignore_byte_string_object = to_delete
        to_delete = ImageType.ALL
    if ignore_byte_string_object is not None:
        warnings.warn(
            "The 'ignore_byte_string_object' argument of remove_images is "
            "deprecated and will be removed in pypdf 4.0.0.",
            category=DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )
    # Translate the public ImageType selection into the matching
    # ObjectDeletionFlag bits understood by remove_objects_from_page.
    flags = ObjectDeletionFlag.NONE
    for image_type, deletion_flag in (
        (ImageType.XOBJECT_IMAGES, ObjectDeletionFlag.XOBJECT_IMAGES),
        (ImageType.INLINE_IMAGES, ObjectDeletionFlag.INLINE_IMAGES),
        (ImageType.DRAWING_IMAGES, ObjectDeletionFlag.DRAWING_IMAGES),
    ):
        if to_delete & image_type:
            flags |= deletion_flag
    # Pass only the selected flags; ObjectDeletionFlag.IMAGES here would
    # ignore ``to_delete`` and strip every kind of image.
    for page in self.pages:
        self.remove_objects_from_page(page, flags)

def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
"""
Expand All @@ -2311,7 +2346,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca
.. deprecated:: 1.28.0
"""
deprecation_with_replacement("removeImages", "remove_images", "3.0.0")
return self.remove_images(ignoreByteStringObject)
return self.remove_images()

def remove_text(self, ignore_byte_string_object: Optional[bool] = None) -> None:
"""
Expand Down
11 changes: 10 additions & 1 deletion pypdf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
PDF Reference, sixth edition, Version 1.7, 2006.
"""

from enum import IntFlag
from enum import IntFlag, auto
from typing import Dict, Tuple


Expand Down Expand Up @@ -585,3 +585,12 @@ class AnnotationFlag(IntFlag):
TypArguments,
TypFitArguments,
)


class ImageType(IntFlag):
    """
    Bit flags selecting which kinds of images an operation applies to.

    Flags can be combined with ``|``; ``ALL`` is the union of the three
    individual kinds.
    """

    NONE = 0
    XOBJECT_IMAGES = auto()
    INLINE_IMAGES = auto()
    DRAWING_IMAGES = auto()
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # alias, for consistency with ObjectDeletionFlag
31 changes: 31 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

from pypdf import (
ImageType,
ObjectDeletionFlag,
PageObject,
PdfMerger,
Expand Down Expand Up @@ -1859,3 +1860,33 @@ def test_object_contains_indirect_reference_to_self():
outpage = writer.add_blank_page(width, height)
outpage.merge_page(reader.pages[6])
writer.append(reader)


def test_remove_image_per_type():
    """Check ``remove_images`` against each ``ImageType`` selection."""
    # Inline images: no BI/ID/EI markers may remain in the page content.
    inline_writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf")
    inline_writer.remove_images(ImageType.INLINE_IMAGES)

    for marker in (b"BI", b"ID", b"EI"):
        assert marker not in inline_writer.pages[0].get_contents().get_data()

    # A bool passed positionally is expected to surface a
    # DeprecationWarning as an exception here.
    # NOTE(review): presumably relies on the test configuration turning
    # warnings into errors -- confirm.
    with pytest.raises(DeprecationWarning):
        inline_writer.remove_images(True)

    # Drawing operators: removed from the page content while text and
    # colour operators are kept.
    uml_writer = PdfWriter(
        clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
    )
    uml_writer.remove_images(ImageType.DRAWING_IMAGES)
    for marker in (b" re\n", b"W*", b"f*"):
        assert marker not in uml_writer.pages[1].get_contents().get_data()
    for marker in (b" TJ\n", b"rg", b"Tm"):
        assert marker in uml_writer.pages[1].get_contents().get_data()
    # The same drawing operators must also be gone from a form XObject.
    for marker in (b" re\n", b"W*", b"f*"):
        assert (
            marker
            not in uml_writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data()
        )

    # XObject images: no ``Do`` call left and the XObject dictionary is empty.
    uml_writer.remove_images(ImageType.XOBJECT_IMAGES)
    assert b"Do\n" not in uml_writer.pages[0].get_contents().get_data()
    assert len(uml_writer.pages[0]["/Resources"]["/XObject"]) == 0

0 comments on commit 2e89609

Please sign in to comment.