From df8f12127aad2c2faa97615e6cd8a766b0ca5ed3 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 11 Jun 2022 14:22:23 +0200 Subject: [PATCH 1/8] BUG: Adobe Acrobat 'Would you like to save this file?' (#970) Issue: When creating files with the current PdfWriter, Adobe Acrobat asks 'would you like to save this file' when attempting to close it - although no changes were made. Fix: Remove 'self.set_need_appearances_writer()' from the writer's __init__ function Caused-by: #412 (see #355) Closes #963 Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- PyPDF2/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index e38af4d74..a070b3387 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -124,7 +124,6 @@ def __init__(self) -> None: ) self._root: Optional[IndirectObject] = None self._root_object = root - self.set_need_appearances_writer() def _add_object(self, obj: Optional[PdfObject]) -> IndirectObject: self._objects.append(obj) @@ -539,6 +538,7 @@ def update_page_form_field_values( second bit sets Required, the third bit sets NoExport. See PDF Reference Table 8.70 for details. 
""" + self.set_need_appearances_writer() # Iterate through pages, update field values for j in range(len(page[PG.ANNOTS])): # type: ignore writer_annot = page[PG.ANNOTS][j].get_object() # type: ignore From 55f7c7b35be4431336a8c5a769a7c261445bd47a Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 10:18:09 +0200 Subject: [PATCH 2/8] STY: Use more tuples and list/dict comprehensions (#976) --- PyPDF2/_page.py | 16 ++++++++-------- PyPDF2/_reader.py | 9 ++++----- PyPDF2/_writer.py | 12 ++++++------ PyPDF2/constants.py | 4 ++-- PyPDF2/papersizes.py | 4 ++-- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index ae44506e6..61227be11 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -548,13 +548,13 @@ def _merge_page( def _expand_mediabox( self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] ) -> None: - corners1 = [ + corners1 = ( self.mediabox.left.as_numeric(), self.mediabox.bottom.as_numeric(), self.mediabox.right.as_numeric(), self.mediabox.top.as_numeric(), - ] - corners2 = [ + ) + corners2 = ( page2.mediabox.left.as_numeric(), page2.mediabox.bottom.as_numeric(), page2.mediabox.left.as_numeric(), @@ -563,17 +563,17 @@ def _expand_mediabox( page2.mediabox.top.as_numeric(), page2.mediabox.right.as_numeric(), page2.mediabox.bottom.as_numeric(), - ] + ) if ctm is not None: ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] - new_x = [ + new_x = tuple( ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] for i in range(0, 8, 2) - ] - new_y = [ + ) + new_y = tuple( ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] for i in range(0, 8, 2) - ] + ) else: new_x = corners2[0:8:2] new_y = corners2[1:8:2] diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 9babe7195..acfc1ca32 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -525,7 +525,7 @@ def _check_kids( self.get_fields(kid.get_object(), retval, fileobj) def _write_field(self, fileobj: Any, field: 
Any, field_attributes: Any) -> None: - order = ["/TM", "/T", "/FT", PA.PARENT, "/TU", "/Ff", "/V", "/DV"] + order = ("/TM", "/T", "/FT", PA.PARENT, "/TU", "/Ff", "/V", "/DV") for attr in order: attr_name = field_attributes[attr] try: @@ -701,10 +701,9 @@ def _get_page_number_by_indirect( ) -> int: """Generate _page_id2num""" if self._page_id2num is None: - id2num = {} - for i, x in enumerate(self.pages): - id2num[x.indirect_ref.idnum] = i # type: ignore - self._page_id2num = id2num + self._page_id2num = { + x.indirect_ref.idnum: i for i, x in enumerate(self.pages) # type: ignore + } if indirect_ref is None or isinstance(indirect_ref, NullObject): return -1 diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index a070b3387..c0528eeaa 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -1209,7 +1209,7 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: """ pg_dict = cast(DictionaryObject, self.get_object(self._pages)) pages = cast(ArrayObject, pg_dict[PA.KIDS]) - jump_operators = [ + jump_operators = ( b_("cm"), b_("w"), b_("J"), @@ -1235,7 +1235,7 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: b_("B"), b_("Do"), b_("sh"), - ] + ) for j in range(len(pages)): page = pages[j] page_ref = cast(DictionaryObject, self.get_object(page)) @@ -1539,7 +1539,7 @@ def addLink( # pragma: no cover deprecate_with_replacement("addLink", "add_link") return self.add_link(pagenum, pagedest, rect, border, fit, *args) - _valid_layouts = [ + _valid_layouts = ( "/NoLayout", "/SinglePage", "/OneColumn", @@ -1547,7 +1547,7 @@ def addLink( # pragma: no cover "/TwoColumnRight", "/TwoPageLeft", "/TwoPageRight", - ] + ) def _get_page_layout(self) -> Optional[LayoutType]: try: @@ -1656,14 +1656,14 @@ def pageLayout(self, layout: LayoutType) -> None: # pragma: no cover deprecate_with_replacement("pageLayout", "page_layout") self.page_layout = layout - _valid_modes = [ + _valid_modes = ( "/UseNone", "/UseOutlines", "/UseThumbs", 
"/FullScreen", "/UseOC", "/UseAttachments", - ] + ) def _get_page_mode(self) -> Optional[PagemodeType]: try: diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py index a62aea1a4..4a6be917e 100644 --- a/PyPDF2/constants.py +++ b/PyPDF2/constants.py @@ -285,7 +285,7 @@ class CatalogDictionary: NEEDS_RENDERING = "/NeedsRendering" # boolean, optional -PDF_KEYS = [ +PDF_KEYS = ( PagesAttributes, PageAttributes, Ressources, @@ -302,4 +302,4 @@ class CatalogDictionary: Core, TrailerKeys, CatalogAttributes, -] +) diff --git a/PyPDF2/papersizes.py b/PyPDF2/papersizes.py index 5ba49d698..51aa2de59 100644 --- a/PyPDF2/papersizes.py +++ b/PyPDF2/papersizes.py @@ -35,7 +35,7 @@ class PaperSize: C4 = Dimensions(649, 918) -_din_a = [ +_din_a = ( PaperSize.A0, PaperSize.A1, PaperSize.A2, @@ -45,4 +45,4 @@ class PaperSize: PaperSize.A6, PaperSize.A7, PaperSize.A8, -] +) From 084745fc51f105f01282c879ed6a18b5b15acbc5 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 10:29:47 +0200 Subject: [PATCH 3/8] MAINT: pre-commit / requirements.txt updates (#977) --- .isort.cfg | 2 +- .pre-commit-config.yaml | 8 ++------ requirements/ci.txt | 10 +++++----- requirements/dev.txt | 10 +++++----- requirements/docs.txt | 4 ++-- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index bd8942df2..7a50801f4 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -4,4 +4,4 @@ indent=' ' multi_line_output=3 length_sort=0 include_trailing_comma=True -known_third_party = dataclasses,pytest,setuptools +known_third_party = pytest,setuptools diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 699f0fe49..9b473c459 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ # pre-commit run --all-files repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.3.0 hooks: - id: check-ast - id: check-byte-order-marker @@ -24,10 +24,6 @@ repos: # rev: v0.942 # hooks: # - id: mypy -- repo: 
https://github.com/asottile/seed-isort-config - rev: v2.2.0 - hooks: - - id: seed-isort-config - repo: https://github.com/pre-commit/mirrors-isort rev: v5.10.1 hooks: @@ -48,7 +44,7 @@ repos: - id: blacken-docs additional_dependencies: [black==22.1.0] - repo: https://github.com/asottile/pyupgrade - rev: v2.31.0 + rev: v2.34.0 hooks: - id: pyupgrade args: [--py36-plus] diff --git a/requirements/ci.txt b/requirements/ci.txt index 5a63727fe..edfd571f2 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -28,9 +28,9 @@ iniconfig==1.1.1 # via pytest mccabe==0.6.1 # via flake8 -more-itertools==8.12.0 +more-itertools==8.13.0 # via flake8-implicit-str-concat -mypy==0.950 +mypy==0.961 # via -r requirements/ci.in mypy-extensions==0.4.3 # via mypy @@ -48,7 +48,7 @@ pycodestyle==2.8.0 # via flake8 pyflakes==2.4.0 # via flake8 -pyparsing==3.0.8 +pyparsing==3.0.9 # via packaging pytest==7.0.1 # via @@ -60,11 +60,11 @@ tomli==1.2.3 # via # mypy # pytest -typed-ast==1.5.3 +typed-ast==1.5.4 # via mypy typeguard==2.13.3 # via -r requirements/ci.in -types-pillow==9.0.14 +types-pillow==9.0.19 # via -r requirements/ci.in typing-extensions==4.1.1 # via diff --git a/requirements/dev.txt b/requirements/dev.txt index be07e211e..273492207 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -10,7 +10,7 @@ black==22.3.0 # via -r requirements/dev.in bleach==4.1.0 # via readme-renderer -certifi==2021.10.8 +certifi==2022.5.18.1 # via requests cffi==1.15.0 # via cryptography @@ -26,7 +26,7 @@ colorama==0.4.4 # via twine coverage[toml]==6.2 # via pytest-cov -cryptography==37.0.1 +cryptography==37.0.2 # via secretstorage dataclasses==0.8 # via black @@ -77,7 +77,7 @@ pep517==0.12.0 # via pip-tools pip-tools==6.4.0 # via -r requirements/dev.in -pkginfo==1.8.2 +pkginfo==1.8.3 # via twine platformdirs==2.4.0 # via @@ -93,7 +93,7 @@ pycparser==2.21 # via cffi pygments==2.12.0 # via readme-renderer -pyparsing==3.0.8 +pyparsing==3.0.9 # via packaging pytest==7.0.1 # via pytest-cov @@ 
-129,7 +129,7 @@ tqdm==4.64.0 # via twine twine==3.8.0 # via -r requirements/dev.in -typed-ast==1.5.3 +typed-ast==1.5.4 # via black typing-extensions==4.1.1 # via diff --git a/requirements/docs.txt b/requirements/docs.txt index 5e42a093d..afe10d6a3 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -10,7 +10,7 @@ attrs==21.4.0 # via markdown-it-py babel==2.10.1 # via sphinx -certifi==2021.10.8 +certifi==2022.5.18.1 # via requests charset-normalizer==2.0.12 # via requests @@ -45,7 +45,7 @@ packaging==21.3 # via sphinx pygments==2.12.0 # via sphinx -pyparsing==3.0.8 +pyparsing==3.0.9 # via packaging pytz==2022.1 # via babel From 8149026b0b7e2dbb328aff78fd674bbdc7bbc3b8 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 13:11:39 +0200 Subject: [PATCH 4/8] ENH: Add support for pathlib as input for PdfReader (#979) --- PyPDF2/_reader.py | 5 +++-- tests/test_reader.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index acfc1ca32..2b1c2c152 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -33,6 +33,7 @@ import warnings from hashlib import md5 from io import BytesIO +from pathlib import Path from typing import ( Any, Callable, @@ -234,7 +235,7 @@ class PdfReader: def __init__( self, - stream: StrByteType, + stream: Union[StrByteType, Path], strict: bool = False, password: Union[None, str, bytes] = None, ) -> None: @@ -251,7 +252,7 @@ def __init__( "It may not be read correctly.", PdfReadWarning, ) - if isinstance(stream, str): + if isinstance(stream, (str, Path)): with open(stream, "rb") as fh: stream = BytesIO(b_(fh.read())) self.read(stream) diff --git a/tests/test_reader.py b/tests/test_reader.py index a91035103..b33425555 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -2,6 +2,7 @@ import os import time from io import BytesIO +from pathlib import Path import pytest @@ -10,7 +11,11 @@ from PyPDF2.constants import ImageAttributes as IA from 
PyPDF2.constants import PageAttributes as PG from PyPDF2.constants import Ressources as RES -from PyPDF2.errors import PdfReadError, PdfReadWarning, STREAM_TRUNCATED_PREMATURELY +from PyPDF2.errors import ( + STREAM_TRUNCATED_PREMATURELY, + PdfReadError, + PdfReadWarning, +) from PyPDF2.filters import _xobj_to_image from tests import get_pdf_from_url @@ -691,3 +696,9 @@ def test_extract_text_hello_world(): "Japanese:", "こんにちは世界", ] + + +def test_read_path(): + path = Path(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) + reader = PdfReader(path) + assert len(reader.pages) == 1 From 41eff2a059ee4f4a9cbca931a110e65a3521bcd7 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 14:02:45 +0200 Subject: [PATCH 5/8] TST: Add MCVE of issue #416 (#980) --- tests/test_reader.py | 10 ++++++++++ tests/test_utils.py | 1 + 2 files changed, 11 insertions(+) diff --git a/tests/test_reader.py b/tests/test_reader.py index b33425555..ca2c02ea4 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -702,3 +702,13 @@ def test_read_path(): path = Path(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) reader = PdfReader(path) assert len(reader.pages) == 1 + + +@pytest.mark.xfail(reason="#416") +def test_read_form_416(): + url = ( + "https://www.fda.gov/downloads/AboutFDA/ReportsManualsForms/Forms/UCM074728.pdf" + ) + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="issue_416.pdf"))) + fields = reader.get_form_text_fields() + assert len(fields) > 0 diff --git a/tests/test_utils.py b/tests/test_utils.py index 627ee9fbb..6b557b87a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -201,6 +201,7 @@ def test_read_block_backwards_at_start(): 6, ), ], + ids=list(range(11)), ) def test_read_previous_line(dat, pos, expected, expected_pos): s = io.BytesIO(dat) From 363372c11005a0ff1dd72b4ff66b5097987d3df6 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 18:04:43 +0200 Subject: [PATCH 6/8] DOC: Notes on annotations (#982) --- 
docs/user/reading-pdf-annotations.md | 15 +++++++++++++++ tests/test_filters.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/user/reading-pdf-annotations.md b/docs/user/reading-pdf-annotations.md index 8c01fe728..fc87377d3 100644 --- a/docs/user/reading-pdf-annotations.md +++ b/docs/user/reading-pdf-annotations.md @@ -16,6 +16,21 @@ PDF 1.7 defines 25 different annotation types: * Watermark * 3D +In general, annotations can be read like this: + +```python +from PyPDF2 import PdfReader + +reader = PdfReader("commented.pdf") + +for page in reader.pages: + if "/Annots" in page: + for annot in page["/Annots"]: + obj = annot.get_object() + annotation = {"subtype": obj["/Subtype"], "location": obj["/Rect"]} + print(annotation) +``` + Reading the most common ones is described here. ## Text diff --git a/tests/test_filters.py b/tests/test_filters.py index e34c5e05f..32dd5e642 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -103,7 +103,7 @@ def test_ASCIIHexDecode(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. - TO-DO What is decode() supposed to do for such inputs as ">>", ">>>" or + TODO What is decode() supposed to do for such inputs as ">>", ">>>" or any other not terminated by ">"? (For the latter case, an exception is currently raised.) 
""" From a15cf67173a81523ff3e9eab3ac5ee98429841c2 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 18:08:30 +0200 Subject: [PATCH 7/8] DEV: Add PI to make_changelog --- make_changelog.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/make_changelog.py b/make_changelog.py index 32b36e84c..c280fc6cd 100644 --- a/make_changelog.py +++ b/make_changelog.py @@ -1,11 +1,10 @@ """Internal tool to update the changelog.""" import subprocess +from dataclasses import dataclass from datetime import datetime from typing import List -from dataclasses import dataclass - @dataclass(frozen=True) class Change: @@ -63,7 +62,7 @@ def get_formatted_changes(git_tag: str) -> str: grouped[commit.prefix].append({"msg": commit.message}) # Order prefixes - order = ["DEP", "ENH", "BUG", "ROB", "DOC", "DEV", "MAINT", "TST", "STY"] + order = ["DEP", "ENH", "PI", "BUG", "ROB", "DOC", "DEV", "MAINT", "TST", "STY"] abbrev2long = { "DEP": "Deprecations", "ENH": "New Features", @@ -74,6 +73,7 @@ def get_formatted_changes(git_tag: str) -> str: "MAINT": "Maintenance", "TST": "Testing", "STY": "Code Style", + "PI": "Performance Improvements", } # Create output From 9c4e7f52fb3c53ed6391d4a96e227116a9473acf Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 12 Jun 2022 18:09:37 +0200 Subject: [PATCH 8/8] REL: 2.1.1 New Features (ENH): - Add support for pathlib as input for PdfReader (#979) Performance Improvements (PI): - Optimize read_next_end_line (#646) Bug Fixes (BUG): - Adobe Acrobat \'Would you like to save this file?\' (#970) Documentation (DOC): - Notes on annotations (#982) - Who uses PyPDF2 - intendet \xe2\x9e\x94 in robustness page (#958) Maintenance (MAINT): - pre-commit / requirements.txt updates (#977) - Mark read_next_end_line as deprecated (#965) - Export `PageObject` in PyPDF2 root (#960) Testing (TST): - Add MCVE of issue #416 (#980) - FlateDecode.decode decodeParms (#964) - Xmp module (#962) - utils.paeth_predictor (#959) Code Style (STY): 
- Use more tuples and list/dict comprehensions (#976) Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.1.0...2.1.1 --- CHANGELOG | 34 ++++++++++++++++++++++++++++++++++ PyPDF2/_version.py | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 5a5cbc0b9..c6c9f52d7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,37 @@ +Version 2.1.1, 2022-06-12 +------------------------- + +New Features (ENH): +- Add support for pathlib as input for PdfReader (#979) + +Performance Improvements (PI): +- Optimize read_next_end_line (#646) + +Bug Fixes (BUG): +- Adobe Acrobat 'Would you like to save this file?' (#970) + +Documentation (DOC): +- Notes on annotations (#982) +- Who uses PyPDF2 +- intendet ➔ in robustness page (#958) + +Maintenance (MAINT): +- pre-commit / requirements.txt updates (#977) +- Mark read_next_end_line as deprecated (#965) +- Export `PageObject` in PyPDF2 root (#960) + +Testing (TST): +- Add MCVE of issue #416 (#980) +- FlateDecode.decode decodeParms (#964) +- Xmp module (#962) +- utils.paeth_predictor (#959) + +Code Style (STY): +- Use more tuples and list/dict comprehensions (#976) + +Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.1.0...2.1.1 + + Version 2.1.0, 2022-06-06 ------------------------- diff --git a/PyPDF2/_version.py b/PyPDF2/_version.py index 9aa3f9036..58039f505 100644 --- a/PyPDF2/_version.py +++ b/PyPDF2/_version.py @@ -1 +1 @@ -__version__ = "2.1.0" +__version__ = "2.1.1"