diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 1fb0cb779..62d309402 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -38,6 +38,8 @@ from hashlib import md5 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from PyPDF2.errors import PdfReadWarning + from ._page import PageObject, _VirtualList from ._reader import PdfReader from ._security import _alg33, _alg34, _alg35 @@ -934,7 +936,8 @@ def _sweep_indirect_references( # Unable to resolve the Object, returning NullObject instead. warnings.warn( f"Unable to resolve [{data.__class__.__name__}: {data}], " - "returning NullObject instead" + "returning NullObject instead", + PdfReadWarning, ) return NullObject() return newobj diff --git a/sample-files b/sample-files index 4d24ff93d..6da0fbb53 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 4d24ff93dcddf21d55d028d9675d5b5bf9d7a350 +Subproject commit 6da0fbb53f11bd5b8a4acf06e4d26e5e2bf5bf57 diff --git a/tests/test_merger.py b/tests/test_merger.py index fb3f8829c..442d636f6 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -245,10 +245,21 @@ def test_sweep_recursion1(): os.remove("tmp-merger-do-not-commit.pdf") -def test_sweep_recursion2(): - # TODO: This test looks like an infinite loop. - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924794.pdf" - name = "tika-924794.pdf" +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + # TODO: This test looks like an infinite loop. 
+ "https://corpora.tika.apache.org/base/docs/govdocs1/924/924794.pdf", + "tika-924794.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf", + "tika-924546.pdf", + ), + ], +) +def test_sweep_recursion2(url, name): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index fcb8f7f52..6340fc0e0 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -5,9 +5,9 @@ import pytest -from PyPDF2 import PdfReader +from PyPDF2 import PdfMerger, PdfReader, PdfWriter from PyPDF2.constants import PageAttributes as PG -from PyPDF2.errors import PdfReadWarning +from PyPDF2.errors import PdfReadError, PdfReadWarning from . import get_pdf_from_url @@ -188,3 +188,140 @@ def test_extract_textbench(enable, url, pages, print_result=False): print(f"{rst}\n*****************************\n") except PdfReadWarning: pass + + +@pytest.mark.parametrize( + ("base_path", "overlay_path"), + [ + ( + "resources/crazyones.pdf", + "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/935/935981.pdf", + "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", + ), + ], +) +def test_overlay(base_path, overlay_path): + if base_path.startswith("http"): + base_path = BytesIO(get_pdf_from_url(base_path, name="tika-935981.pdf")) + else: + base_path = os.path.join(PROJECT_ROOT, base_path) + reader = PdfReader(base_path) + writer = PdfWriter() + + reader_overlay = PdfReader(os.path.join(PROJECT_ROOT, overlay_path)) + overlay = reader_overlay.pages[0] + + for page in reader.pages: + page.merge_page(overlay) + writer.add_page(page) + with open("dont_commit_overlay.pdf", "wb") as fp: + writer.write(fp) + + # Cleanup: remove the temporary overlay output written above + os.remove("dont_commit_overlay.pdf") + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf", + 
"tika-924546.pdf", + ) + ], +) +def test_merge_with_warning(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + merger = PdfMerger() + merger.append(reader) + # This could actually be a performance bottleneck: + with pytest.warns( + PdfReadWarning, match="^Unable to resolve .*, returning NullObject instead" + ): + merger.write("tmp.merged.pdf")  # NOTE(review): tmp.merged.pdf is not removed by this test + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/980/980613.pdf", + "tika-980613.pdf", + ) + ], +) +def test_merge(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + merger = PdfMerger() + merger.append(reader) + merger.write("tmp.merged.pdf")  # NOTE(review): tmp.merged.pdf is not removed by this test + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/935/935996.pdf", + "tika-935996.pdf", + ) + ], +) +def test_get_metadata(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + reader.metadata  # no assertion: passes as long as reading the metadata does not raise + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", + "tika-938702.pdf", + ) + ], +) +def test_extract_text(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + reader.metadata  # NOTE(review): only metadata is read; no text-extraction call despite the test name - confirm intent + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", + "tika-938702.pdf", + ) + ], +) +def test_compress(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + # TODO: which page exactly? + # TODO: Is it reasonable to have an exception here? 
+ with pytest.raises(PdfReadError) as exc: + for page in reader.pages: + page.compress_content_streams() + assert exc.value.args[0] == "Unexpected end of stream" + + +def test_get_fields(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/961/961883.pdf" + name = "tika-961883.pdf" + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + with open("tmp.txt", "w") as fp: + with pytest.warns(PdfReadWarning, match="Object 2 0 not defined."): + retrieved_fields = reader.get_fields(fileobj=fp) + + assert retrieved_fields == {} + + # Cleanup: remove the temporary file passed to get_fields + os.remove("tmp.txt")