From 5a42b96f67c67d2c5b8cbfbc01278f81edcd18bd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 10 Sep 2023 12:03:20 +0200 Subject: [PATCH 1/5] BUG: Merge pages without resources (#2150) closes #2147 --- pypdf/_page.py | 7 ++++++- tests/test_page.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 2b238c245b..b351749ea4 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1183,8 +1183,13 @@ def _merge_page_writer( pdf = self.indirect_reference.pdf rename = {} + if PG.RESOURCES not in self: + self[NameObject(PG.RESOURCES)] = DictionaryObject() original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) - page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) + if PG.RESOURCES not in page2: + page2resources = DictionaryObject() + else: + page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) for res in ( RES.EXT_G_STATE, diff --git a/tests/test_page.py b/tests/test_page.py index 7e6faf399f..fb916ea498 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1227,6 +1227,16 @@ def create_stamp_pdf() -> BytesIO: ) +def test_merge_with_no_resources(): + """Test for issue #2147""" + writer = PdfWriter() + p0 = writer.add_blank_page(900, 1200) + del p0["/Resources"] + p1 = writer.add_blank_page(900, 1200) + del p1["/Resources"] + writer.pages[0].merge_page(p1) + + def test_get_contents_from_nullobject(): """Issue #2157""" writer = PdfWriter() From f68138c946bfb24711c75bba631022ccc0604054 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 10 Sep 2023 12:04:44 +0200 Subject: [PATCH 2/5] BUG: Cope with extra space (#2151) Closes #1903 --- pypdf/_writer.py | 1 + tests/test_writer.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 4ef8b9ffa8..25a6444d33 100644 --- a/pypdf/_writer.py +++ 
b/pypdf/_writer.py @@ -862,6 +862,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Extract font information da = cast(str, field[AA.DA]) font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") + font_properties = [x for x in font_properties if x != ""] font_name = font_properties[font_properties.index("Tf") - 2] font_height = float(font_properties[font_properties.index("Tf") - 1]) if font_height == 0: diff --git a/tests/test_writer.py b/tests/test_writer.py index 6c4764cb75..c9766f9797 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1788,6 +1788,20 @@ def test_viewerpreferences(): assert writer.viewer_preferences is None +def test_extra_spaces_in_da_text(caplog): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") + t = writer.pages[0]["/Annots"][0].get_object()["/DA"] + t = t.replace("/Helv", "/Helv ") + writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) + writer.update_page_form_field_values( + writer.pages[0], {"foo": "abcd"}, auto_regenerate=False + ) + t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() + assert "Font dictionary for not found." 
not in caplog.text + assert b"/Helv" in t + assert b"(abcd)" in t + + @pytest.mark.enable_socket() def test_object_contains_indirect_reference_to_self(): url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" From 0ca4d37a01b529377c0af6c72ebc5847b6a3fa45 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 10 Sep 2023 13:49:01 +0200 Subject: [PATCH 3/5] BUG: Cope with indirect objects in filters and remove deprecated code (#2177) closes #2158 closes #2159 --- pypdf/filters.py | 91 ++++++++++++++++++++----------------------- tests/test_filters.py | 9 ++--- 2 files changed, 46 insertions(+), 54 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index f308a90105..59599e9f1d 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,6 +43,7 @@ from ._utils import ( b_, deprecate_with_replacement, + deprecation_no_replacement, logger_warning, ord_, ) @@ -53,7 +54,7 @@ from .constants import ImageAttributes as IA from .constants import LzwFilterParameters as LZW from .constants import StreamAttributes as SA -from .errors import PdfReadError, PdfStreamError +from .errors import DeprecationError, PdfReadError, PdfStreamError from .generic import ( ArrayObject, DictionaryObject, @@ -93,7 +94,7 @@ class FlateDecode: @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ @@ -113,17 +114,15 @@ def decode( if "decodeParms" in kwargs: # deprecated deprecate_with_replacement("decodeParms", "parameters", "4.0.0") decode_parms = kwargs["decodeParms"] + if isinstance(decode_parms, ArrayObject): # type: ignore + raise DeprecationError("decode_parms as ArrayObject is depreciated") + str_data = decompress(data) predictor = 1 if decode_parms: try: - if isinstance(decode_parms, ArrayObject): - for decode_parm in decode_parms: - if "/Predictor" in decode_parm: - predictor = 
decode_parm["/Predictor"] - else: - predictor = decode_parms.get("/Predictor", 1) + predictor = decode_parms.get("/Predictor", 1) except (AttributeError, TypeError): # Type Error is NullObject pass # Usually an array with a null object was read # predictor 1 == no predictor @@ -131,24 +130,21 @@ def decode( # The /Columns param. has 1 as the default value; see ISO 32000, # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8 DEFAULT_BITS_PER_COMPONENT = 8 - if isinstance(decode_parms, ArrayObject): + try: + columns = cast(int, decode_parms[LZW.COLUMNS].get_object()) # type: ignore + except (TypeError, KeyError): columns = 1 - bits_per_component = DEFAULT_BITS_PER_COMPONENT - for decode_parm in decode_parms: - if "/Columns" in decode_parm: - columns = decode_parm["/Columns"] - if LZW.BITS_PER_COMPONENT in decode_parm: - bits_per_component = decode_parm[LZW.BITS_PER_COMPONENT] - else: - columns = ( - 1 if decode_parms is None else decode_parms.get(LZW.COLUMNS, 1) - ) - colors = 1 if decode_parms is None else decode_parms.get(LZW.COLORS, 1) - bits_per_component = ( - decode_parms.get(LZW.BITS_PER_COMPONENT, DEFAULT_BITS_PER_COMPONENT) - if decode_parms - else DEFAULT_BITS_PER_COMPONENT + try: + colors = cast(int, decode_parms[LZW.COLORS].get_object()) # type: ignore + except (TypeError, KeyError): + colors = 1 + try: + bits_per_component = cast( + int, + decode_parms[LZW.BITS_PER_COMPONENT].get_object(), # type: ignore ) + except (TypeError, KeyError): + bits_per_component = DEFAULT_BITS_PER_COMPONENT # PNG predictor can vary by row and so is the lead byte on each row rowlength = ( @@ -259,7 +255,7 @@ class ASCIIHexDecode: @staticmethod def decode( data: Union[str, bytes], - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ @@ -278,9 +274,8 @@ def decode( Raises: PdfStreamError: """ - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", 
"parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here + if isinstance(data, str): data = data.encode() retval = b"" @@ -321,7 +316,7 @@ class RunLengthDecode: @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: """ @@ -337,9 +332,8 @@ def decode( Raises: PdfStreamError: """ - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here + lst = [] index = 0 while True: @@ -453,7 +447,7 @@ def decode(self) -> str: @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> str: """ @@ -466,9 +460,8 @@ def decode( Returns: decoded data. """ - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here + return LZWDecode.Decoder(data).decode() @@ -478,12 +471,11 @@ class ASCII85Decode: @staticmethod def decode( data: Union[str, bytes], - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here + if isinstance(data, str): data = data.encode("ascii") group_index = b = 0 @@ -511,12 +503,10 @@ class DCTDecode: @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: - if "decodeParms" in kwargs: # deprecated - 
deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here return data @@ -524,12 +514,10 @@ class JPXDecode: @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: - if "decodeParms" in kwargs: # deprecated - deprecate_with_replacement("decodeParms", "parameters", "4.0.0") - decode_parms = kwargs["decodeParms"] # noqa: F841 + # decode_parms is unused here return data @@ -591,13 +579,18 @@ def _get_parameters( @staticmethod def decode( data: bytes, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, + decode_parms: Optional[DictionaryObject] = None, height: int = 0, **kwargs: Any, ) -> bytes: + # decode_parms is unused here if "decodeParms" in kwargs: # deprecated deprecate_with_replacement("decodeParms", "parameters", "4.0.0") decode_parms = kwargs["decodeParms"] + if isinstance(decode_parms, ArrayObject): # deprecated + deprecation_no_replacement( + "decode_parms being an ArrayObject", removed_in="3.15.5" + ) parms = CCITTFaxDecode._get_parameters(decode_parms, height) img_size = len(data) diff --git a/tests/test_filters.py b/tests/test_filters.py index 9268186aad..d2765c86eb 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -10,7 +10,7 @@ from PIL import Image from pypdf import PdfReader -from pypdf.errors import PdfReadError, PdfStreamError +from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError from pypdf.filters import ( ASCII85Decode, ASCIIHexDecode, @@ -69,16 +69,15 @@ def test_flatedecode_unsupported_predictor(): codec.decode(codec.encode(s), DictionaryObject({"/Predictor": predictor})) -@pytest.mark.parametrize( - "params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}]), "a"] -) +@pytest.mark.parametrize("params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}])]) def 
test_flate_decode_decompress_with_array_params(params): """FlateDecode decode() method works correctly with array parameters.""" codec = FlateDecode() s = "" s = s.encode() encoded = codec.encode(s) - assert codec.decode(encoded, params) == s + with pytest.raises(DeprecationError): + assert codec.decode(encoded, params) == s @pytest.mark.parametrize( From 4657df512fdd840e89b74e7c50105818fedf56f0 Mon Sep 17 00:00:00 2001 From: rchen19 Date: Sun, 10 Sep 2023 06:23:53 -0700 Subject: [PATCH 4/5] BUG: catch the case where w[0] is an IndirectObject instead of an int (#2154) Closes #2137 --- pypdf/_cmap.py | 2 +- tests/test_cmap.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index b09119d3d8..6392805be0 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -410,7 +410,7 @@ def compute_space_width( else: w = [] while len(w) > 0: - st = w[0] + st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() if isinstance(second, int): for x in range(st, second): diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 262869c94f..9d231cc49a 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -191,6 +191,18 @@ def test_unixxx_glyphs(): assert pat in txt +@pytest.mark.enable_socket() +def test_cmap_compute_space_width(): + # issue 2137 + # original file URL: + url = "https://arxiv.org/pdf/2005.05909.pdf" + # URL from github issue is too long to pass code style check, use original arxiv URL instead + # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" + name = "TextAttack_paper.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].extract_text() # no error + + @pytest.mark.enable_socket() def test_tabs_in_cmap(): """Issue #2173""" From fb3548538a90ce8cb0773a62131801631ba28276 Mon Sep 17 00:00:00 2001 From: Martin Thoma 
Date: Sun, 10 Sep 2023 15:26:45 +0200 Subject: [PATCH 5/5] REL: 3.16.0 ## What's new ### Security (SEC) - Infinite recursion caused by IndirectObject clone (#2156) ### New Features (ENH) - Ease access to ViewerPreferences (#2144) ### Bug Fixes (BUG) - catch the case where w[0] is an IndirectObject instead of an int (#2154) - Cope with indirect objects in filters and remove deprecated code (#2177) - Cope with extra space (#2151) - Merge pages without resources (#2150) - getcontents() shall return None if contents is NullObject (#2161) - Fix conversion from 1 to LA (#2175) - Accept tabs in cmaps (#2174) ### Robustness (ROB) - Accept XYZ with no arguments (#2178) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.5...3.16.0) --- CHANGELOG.md | 21 +++++++++++++++++++++ pypdf/_version.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index faecb9a333..d963ed09f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # CHANGELOG +## Version 3.16.0, 2023-09-10 + +### Security (SEC) +- Infinite recursion caused by IndirectObject clone (#2156) + +### New Features (ENH) +- Ease access to ViewerPreferences (#2144) + +### Bug Fixes (BUG) +- Catch the case where w[0] is an IndirectObject instead of an int (#2154) +- Cope with indirect objects in filters and remove deprecated code (#2177) +- Accept tabs in cmaps (#2174) / cope with extra space (#2151) +- Merge pages without resources (#2150) +- getcontents() shall return None if contents is NullObject (#2161) +- Fix conversion from 1 to LA (#2175) + +### Robustness (ROB) +- Accept XYZ with no arguments (#2178) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.15.5...3.16.0) + ## Version 3.15.5, 2023-09-03 ### Bug Fixes (BUG) diff --git a/pypdf/_version.py b/pypdf/_version.py index 3c95c3c3fe..331093ae1a 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.15.5" +__version__ = "3.16.0"