Skip to content

Commit

Permalink
Merge branch 'main' into iss2138
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz authored Sep 11, 2023
2 parents 9db0bd5 + fb35485 commit 125b4ae
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 57 deletions.
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
# CHANGELOG

## Version 3.16.0, 2023-09-10

### Security (SEC)
- Infinite recursion caused by IndirectObject clone (#2156)

### New Features (ENH)
- Ease access to ViewerPreferences (#2144)

### Bug Fixes (BUG)
- Catch the case where w[0] is an IndirectObject instead of an int (#2154)
- Cope with indirect objects in filters and remove deprecated code (#2177)
- Accept tabs in cmaps (#2174) / cope with extra space (#2151)
- Merge pages without resources (#2150)
- getcontents() shall return None if contents is NullObject (#2161)
- Fix conversion from 1 to LA (#2175)

### Robustness (ROB)
- Accept XYZ with no arguments (#2178)

[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.5...3.16.0)

## Version 3.15.5, 2023-09-03

### Bug Fixes (BUG)
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ def compute_space_width(
else:
w = []
while len(w) > 0:
st = w[0]
st = w[0] if isinstance(w[0], int) else w[0].get_object()
second = w[1].get_object()
if isinstance(second, int):
for x in range(st, second):
Expand Down
7 changes: 6 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,8 +1183,13 @@ def _merge_page_writer(
pdf = self.indirect_reference.pdf

rename = {}
if PG.RESOURCES not in self:
self[NameObject(PG.RESOURCES)] = DictionaryObject()
original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
if PG.RESOURCES not in page2:
page2resources = DictionaryObject()
else:
page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

for res in (
RES.EXT_G_STATE,
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.15.5"
__version__ = "3.16.0"
1 change: 1 addition & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,7 @@ def _update_text_field(self, field: DictionaryObject) -> None:
# Extract font information
da = cast(str, field[AA.DA])
font_properties = da.replace("\n", " ").replace("\r", " ").split(" ")
font_properties = [x for x in font_properties if x != ""]
font_name = font_properties[font_properties.index("Tf") - 2]
font_height = float(font_properties[font_properties.index("Tf") - 1])
if font_height == 0:
Expand Down
91 changes: 42 additions & 49 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from ._utils import (
b_,
deprecate_with_replacement,
deprecation_no_replacement,
logger_warning,
ord_,
)
Expand All @@ -53,7 +54,7 @@
from .constants import ImageAttributes as IA
from .constants import LzwFilterParameters as LZW
from .constants import StreamAttributes as SA
from .errors import PdfReadError, PdfStreamError
from .errors import DeprecationError, PdfReadError, PdfStreamError
from .generic import (
ArrayObject,
DictionaryObject,
Expand Down Expand Up @@ -93,7 +94,7 @@ class FlateDecode:
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
"""
Expand All @@ -113,42 +114,37 @@ def decode(
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"]
if isinstance(decode_parms, ArrayObject): # type: ignore
raise DeprecationError("decode_parms as ArrayObject is depreciated")

str_data = decompress(data)
predictor = 1

if decode_parms:
try:
if isinstance(decode_parms, ArrayObject):
for decode_parm in decode_parms:
if "/Predictor" in decode_parm:
predictor = decode_parm["/Predictor"]
else:
predictor = decode_parms.get("/Predictor", 1)
predictor = decode_parms.get("/Predictor", 1)
except (AttributeError, TypeError): # Type Error is NullObject
pass # Usually an array with a null object was read
# predictor 1 == no predictor
if predictor != 1:
# The /Columns param. has 1 as the default value; see ISO 32000,
# §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8
DEFAULT_BITS_PER_COMPONENT = 8
if isinstance(decode_parms, ArrayObject):
try:
columns = cast(int, decode_parms[LZW.COLUMNS].get_object()) # type: ignore
except (TypeError, KeyError):
columns = 1
bits_per_component = DEFAULT_BITS_PER_COMPONENT
for decode_parm in decode_parms:
if "/Columns" in decode_parm:
columns = decode_parm["/Columns"]
if LZW.BITS_PER_COMPONENT in decode_parm:
bits_per_component = decode_parm[LZW.BITS_PER_COMPONENT]
else:
columns = (
1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1)
)
colors = 1 if decode_parms is None else decode_parms.get(LZW.COLORS, 1)
bits_per_component = (
decode_parms.get(LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT)
if decode_parms
else DEFAULT_BITS_PER_COMPONENT
try:
colors = cast(int, decode_parms[LZW.COLORS].get_object()) # type: ignore
except (TypeError, KeyError):
colors = 1
try:
bits_per_component = cast(
int,
decode_parms[LZW.BITS_PER_COMPONENT].get_object(), # type: ignore
)
except (TypeError, KeyError):
bits_per_component = DEFAULT_BITS_PER_COMPONENT

# PNG predictor can vary by row and so is the lead byte on each row
rowlength = (
Expand Down Expand Up @@ -259,7 +255,7 @@ class ASCIIHexDecode:
@staticmethod
def decode(
data: Union[str, bytes],
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
"""
Expand All @@ -278,9 +274,8 @@ def decode(
Raises:
PdfStreamError:
"""
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here

if isinstance(data, str):
data = data.encode()
retval = b""
Expand Down Expand Up @@ -321,7 +316,7 @@ class RunLengthDecode:
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
"""
Expand All @@ -337,9 +332,8 @@ def decode(
Raises:
PdfStreamError:
"""
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here

lst = []
index = 0
while True:
Expand Down Expand Up @@ -453,7 +447,7 @@ def decode(self) -> str:
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> str:
"""
Expand All @@ -466,9 +460,8 @@ def decode(
Returns:
decoded data.
"""
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here

return LZWDecode.Decoder(data).decode()


Expand All @@ -478,12 +471,11 @@ class ASCII85Decode:
@staticmethod
def decode(
data: Union[str, bytes],
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here

if isinstance(data, str):
data = data.encode("ascii")
group_index = b = 0
Expand Down Expand Up @@ -511,25 +503,21 @@ class DCTDecode:
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here
return data


class JPXDecode:
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"] # noqa: F841
# decode_parms is unused here
return data


Expand Down Expand Up @@ -591,13 +579,18 @@ def _get_parameters(
@staticmethod
def decode(
data: bytes,
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
decode_parms: Optional[DictionaryObject] = None,
height: int = 0,
**kwargs: Any,
) -> bytes:
# decode_parms is forwarded to CCITTFaxDecode._get_parameters below
if "decodeParms" in kwargs: # deprecated
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
decode_parms = kwargs["decodeParms"]
if isinstance(decode_parms, ArrayObject): # deprecated
deprecation_no_replacement(
"decode_parms being an ArrayObject", removed_in="3.15.5"
)
parms = CCITTFaxDecode._get_parameters(decode_parms, height)

img_size = len(data)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,18 @@ def test_unixxx_glyphs():
assert pat in txt


@pytest.mark.enable_socket()
def test_cmap_compute_space_width():
# issue 2137
# original file URL:
url = "https://arxiv.org/pdf/2005.05909.pdf"
# URL from github issue is too long to pass code style check, use original arxiv URL instead
# url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf"
name = "TextAttack_paper.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
reader.pages[0].extract_text() # no error


@pytest.mark.enable_socket()
def test_tabs_in_cmap():
"""Issue #2173"""
Expand Down
9 changes: 4 additions & 5 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from PIL import Image

from pypdf import PdfReader
from pypdf.errors import PdfReadError, PdfStreamError
from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError
from pypdf.filters import (
ASCII85Decode,
ASCIIHexDecode,
Expand Down Expand Up @@ -69,16 +69,15 @@ def test_flatedecode_unsupported_predictor():
codec.decode(codec.encode(s), DictionaryObject({"/Predictor": predictor}))


@pytest.mark.parametrize(
"params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}]), "a"]
)
@pytest.mark.parametrize("params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}])])
def test_flate_decode_decompress_with_array_params(params):
"""FlateDecode decode() method works correctly with array parameters."""
codec = FlateDecode()
s = ""
s = s.encode()
encoded = codec.encode(s)
assert codec.decode(encoded, params) == s
with pytest.raises(DeprecationError):
assert codec.decode(encoded, params) == s


@pytest.mark.parametrize(
Expand Down
10 changes: 10 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,6 +1227,16 @@ def create_stamp_pdf() -> BytesIO:
)


def test_merge_with_no_resources():
"""Test for issue #2147"""
writer = PdfWriter()
p0 = writer.add_blank_page(900, 1200)
del p0["/Resources"]
p1 = writer.add_blank_page(900, 1200)
del p1["/Resources"]
writer.pages[0].merge_page(p1)


def test_get_contents_from_nullobject():
"""Issue #2157"""
writer = PdfWriter()
Expand Down
14 changes: 14 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1797,6 +1797,20 @@ def test_viewerpreferences():
assert writer.viewer_preferences is None


def test_extra_spaces_in_da_text(caplog):
writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf")
t = writer.pages[0]["/Annots"][0].get_object()["/DA"]
t = t.replace("/Helv", "/Helv ")
writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t)
writer.update_page_form_field_values(
writer.pages[0], {"foo": "abcd"}, auto_regenerate=False
)
t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data()
assert "Font dictionary for not found." not in caplog.text
assert b"/Helv" in t
assert b"(abcd)" in t


@pytest.mark.enable_socket()
def test_object_contains_indirect_reference_to_self():
url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf"
Expand Down

0 comments on commit 125b4ae

Please sign in to comment.