From 42c358d9e2a0801df0b5e9be9dd6e5cabb2814f6 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 16 Jun 2022 21:56:06 +0200 Subject: [PATCH 1/3] TST: reader.get_fields --- tests/test_generic.py | 9 +++++++++ tests/test_reader.py | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/tests/test_generic.py b/tests/test_generic.py index 0b34af86d..77c5f82b4 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -442,3 +442,12 @@ def test_read_inline_image_loc_neg_1(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: page.extract_text() + + +def test_text_string_write_to_stream(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924562.pdf" + name = "tika-924562.pdf" + + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for page in reader.pages: + page.compress_content_streams() diff --git a/tests/test_reader.py b/tests/test_reader.py index f161e5737..43215ab81 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -749,3 +749,13 @@ def test_extract_text_pdf15(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-976030.pdf"))) for page in reader.pages: page.extract_text() + + +def test_get_fields(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972486.pdf" + name = "tika-972486.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + fields = reader.get_fields() + assert fields is not None + assert "c1-1" in fields + assert dict(fields["c1-1"]) == ({"/FT": "/Btn", "/T": "c1-1"}) From de311aa82608b180ff868a53653fdcb2411bdb7f Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 16 Jun 2022 22:22:42 +0200 Subject: [PATCH 2/3] More tests --- tests/test_reader.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_reader.py b/tests/test_reader.py index 43215ab81..6aabd10af 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -759,3 +759,27 @@ def test_get_fields(): assert fields is not None assert "c1-1" in fields assert dict(fields["c1-1"]) == ({"/FT": "/Btn", "/T": "c1-1"}) + + +def test_get_fields_read_else_block(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/934/934771.pdf" + name = "tika-934771.pdf" + with pytest.raises(PdfReadError) as exc: + PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert exc.value.args[0] == "Could not find xref table at specified location" + + +def test_get_fields_read_else_block(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914902.pdf" + name = "tika-914902.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + fields = reader.get_fields() + assert fields is None + + +def test_get_fields_read_else_block(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957721.pdf" + name = "tika-957721.pdf" + with pytest.raises(PdfReadError) as exc: + PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert exc.value.args[0] == "Could not find xref table at specified location" From e3744844c12e25a4804bc68da3d5cd34ad590047 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 16 Jun 2022 22:27:50 +0200 Subject: [PATCH 3/3] More tests --- tests/test_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 6aabd10af..8362ec094 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -769,7 +769,7 @@ def test_get_fields_read_else_block(): assert exc.value.args[0] == "Could not find xref table at specified location" -def test_get_fields_read_else_block(): +def test_get_fields_read_else_block2(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914902.pdf" name = "tika-914902.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -777,7 +777,7 @@ def test_get_fields_read_else_block(): assert fields is None -def test_get_fields_read_else_block(): +def test_get_fields_read_else_block3(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957721.pdf" name = "tika-957721.pdf" with pytest.raises(PdfReadError) as exc: