From c79f0b9ef56fa01572d60eb092bb66ba376f2250 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 23:01:41 +0200 Subject: [PATCH 1/6] TST: Increase test coverage --- PyPDF2/_writer.py | 5 +++- tests/test_merger.py | 13 ++++++++++ tests/test_workflows.py | 57 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 1fb0cb779..62d309402 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -38,6 +38,8 @@ from hashlib import md5 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from PyPDF2.errors import PdfReadWarning + from ._page import PageObject, _VirtualList from ._reader import PdfReader from ._security import _alg33, _alg34, _alg35 @@ -934,7 +936,8 @@ def _sweep_indirect_references( # Unable to resolve the Object, returning NullObject instead. warnings.warn( f"Unable to resolve [{data.__class__.__name__}: {data}], " - "returning NullObject instead" + "returning NullObject instead", + PdfReadWarning, ) return NullObject() return newobj diff --git a/tests/test_merger.py b/tests/test_merger.py index fb3f8829c..6104720ef 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -257,3 +257,16 @@ def test_sweep_recursion2(): # cleanup os.remove("tmp-merger-do-not-commit.pdf") + + +def test_foo(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" + name = "tika-924546.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + merger = PdfMerger() + merger.append(reader) + with pytest.warns(UserWarning, match="returning NullObject instead"): + merger.write("tmp-merger-do-not-commit.pdf") + + # cleanup + os.remove("tmp-merger-do-not-commit.pdf") diff --git a/tests/test_workflows.py b/tests/test_workflows.py index fcb8f7f52..d2ff2c0ed 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -5,7 +5,7 @@ import pytest -from PyPDF2 import PdfReader +from PyPDF2 import PdfMerger, PdfReader, PdfWriter from PyPDF2.constants import PageAttributes as PG from PyPDF2.errors import PdfReadWarning @@ -188,3 +188,58 @@ def test_extract_textbench(enable, url, pages, print_result=False): print(f"{rst}\n*****************************\n") except PdfReadWarning: pass + + +@pytest.mark.parametrize( + ("base_path", "overlay_path"), + [ + ( + "resources/crazyones.pdf", + "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/935/935981.pdf", + "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", + ), + ], +) +def test_overlay(base_path, overlay_path): + if base_path.startswith("http"): + base_path = BytesIO(get_pdf_from_url(base_path, name="tika-935981.pdf")) + else: + base_path = os.path.join(PROJECT_ROOT, base_path) + reader = PdfReader(base_path) + writer = PdfWriter() + + reader_overlay = PdfReader(os.path.join(PROJECT_ROOT, overlay_path)) + overlay = reader_overlay.pages[0] + + for page in reader.pages: + page.merge_page(overlay) + writer.add_page(page) + with open("dont_commit_overlay.pdf", "wb") as fp: + writer.write(fp) + + # Cleanup + os.remove("dont_commit_overlay.pdf") + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf", + "tika-924546.pdf", + ) + ], +) +def test_merge(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + merger = PdfMerger() + merger.append(reader) + # This could actually be a performance bottleneck: + with pytest.warns( + PdfReadWarning, match="^Unable to resolve .*, returning NullObject instead" + ): + merger.write("tmp.merged.pdf") From 07994fc6f700cfab005a680db763e92f513d59fd Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 23:09:02 +0200 Subject: [PATCH 2/6] Sample-files --- sample-files | 2 +- tests/test_merger.py | 32 +++++++++++++++----------------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sample-files b/sample-files index 4d24ff93d..6da0fbb53 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 4d24ff93dcddf21d55d028d9675d5b5bf9d7a350 +Subproject commit 6da0fbb53f11bd5b8a4acf06e4d26e5e2bf5bf57 diff --git a/tests/test_merger.py b/tests/test_merger.py index 6104720ef..442d636f6 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -245,23 +245,21 @@ def test_sweep_recursion1(): os.remove("tmp-merger-do-not-commit.pdf") -def test_sweep_recursion2(): - # TODO: This test looks like an infinite loop. - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924794.pdf" - name = "tika-924794.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - with pytest.warns(UserWarning, match="returning NullObject instead"): - merger.write("tmp-merger-do-not-commit.pdf") - - # cleanup - os.remove("tmp-merger-do-not-commit.pdf") - - -def test_foo(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + # TODO: This test looks like an infinite loop. + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924794.pdf", + "tika-924794.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf", + "tika-924546.pdf", + ), + ], +) +def test_sweep_recursion2(url, name): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) From a768e07d6ef47553e11d36edaacefca4d37d1a41 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 23:20:10 +0200 Subject: [PATCH 3/6] Metadata test --- tests/test_workflows.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index d2ff2c0ed..b3e230325 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -243,3 +243,18 @@ def test_merge(url, name): PdfReadWarning, match="^Unable to resolve .*, returning NullObject instead" ): merger.write("tmp.merged.pdf") + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/935/935996.pdf", + "tika-935996.pdf", + ) + ], +) +def test_get_metadata(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + reader.metadata From 797439a18b8bcd1c333d3116cd2556fa1481eea4 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 23:22:20 +0200 Subject: [PATCH 4/6] Text extraction test --- tests/test_workflows.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index b3e230325..6742b7b90 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -258,3 +258,18 @@ def test_get_metadata(url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) reader.metadata + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", + "tika-938702.pdf", + ) + ], +) +def test_extract_text(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + reader.metadata From 955d30fc6847f242cbe6466a07b5e38b1bb02a24 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 30 Jun 2022 23:26:50 +0200 Subject: [PATCH 5/6] Add compression test --- tests/test_workflows.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 6742b7b90..61468e99a 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -7,7 +7,7 @@ from PyPDF2 import PdfMerger, PdfReader, PdfWriter from PyPDF2.constants import PageAttributes as PG -from PyPDF2.errors import PdfReadWarning +from PyPDF2.errors import PdfReadError, PdfReadWarning from . import get_pdf_from_url @@ -273,3 +273,23 @@ def test_extract_text(url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) reader.metadata + + +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", + "tika-938702.pdf", + ) + ], +) +def test_compress(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + # TODO: which page exactly? + # TODO: Is it reasonable to have an exception here? + with pytest.raises(PdfReadError) as exc: + for page in reader.pages: + page.compress_content_streams() + assert exc.value.args[0] == "Unexpected end of stream" From 62be24cd9b76b0399fb8f900b95e58f8c2cf16e0 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 3 Jul 2022 16:23:32 +0200 Subject: [PATCH 6/6] Add more tests --- tests/test_workflows.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 61468e99a..6340fc0e0 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -233,7 +233,7 @@ def test_overlay(base_path, overlay_path): ) ], ) -def test_merge(url, name): +def test_merge_with_warning(url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) merger = PdfMerger() @@ -245,6 +245,23 @@ def test_merge(url, name): merger.write("tmp.merged.pdf") +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/980/980613.pdf", + "tika-980613.pdf", + ) + ], +) +def test_merge(url, name): + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + merger = PdfMerger() + merger.append(reader) + merger.write("tmp.merged.pdf") + + @pytest.mark.parametrize( ("url", "name"), [ @@ -293,3 +310,18 @@ def test_compress(url, name): for page in reader.pages: page.compress_content_streams() assert exc.value.args[0] == "Unexpected end of stream" + + +def test_get_fields(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/961/961883.pdf" + name = "tika-961883.pdf" + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data) + with open("tmp.txt", "w") as fp: + with pytest.warns(PdfReadWarning, match="Object 2 0 not defined."): + retrieved_fields = reader.get_fields(fileobj=fp) + + assert retrieved_fields == {} + + # Cleanup + os.remove("tmp.txt")