Skip to content

Commit

Permalink
And more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Jun 16, 2022
1 parent c3f8e37 commit 5d5b2d8
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
9 changes: 9 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,12 @@ def test_read_inline_image_no_has_q():
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


def test_read_inline_image_loc_neg_1():
    """Smoke test: extract text from every page of tika-935066.pdf.

    There are no assertions; the test passes as long as reading the
    document and calling ``extract_text`` on each page does not raise.
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/935/935066.pdf",
        name="tika-935066.pdf",
    )
    pdf = PdfReader(BytesIO(pdf_bytes))
    for pg in pdf.pages:
        pg.extract_text()
17 changes: 12 additions & 5 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from PyPDF2 import PdfReader, Transformation
from PyPDF2._page import PageObject
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.errors import PdfReadWarning
from PyPDF2.generic import DictionaryObject, NameObject, RectangleObject

from . import get_pdf_from_url
Expand Down Expand Up @@ -245,14 +246,20 @@ def test_extract_text_single_quote_op():
"https://corpora.tika.apache.org/base/docs/govdocs1/932/932446.pdf",
"tika-932446.pdf",
),
# Impossible decode xform:
(
"https://corpora.tika.apache.org/base/docs/govdocs1/972/972962.pdf",
"tika-972962.pdf",
),
],
)
def test_extract_text_page_pdf(url, name):
    """Smoke test: extract text from every page of the PDF at *url*.

    ``name`` is forwarded to ``get_pdf_from_url`` together with ``url``.
    There are no assertions; the test passes when no page raises during
    ``extract_text``.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    pdf = PdfReader(BytesIO(pdf_bytes))
    for pg in pdf.pages:
        pg.extract_text()


def test_extract_text_page_pdf_impossible_decode_xform():
    """Check that extracting text from tika-972962.pdf emits a warning.

    Iterating over the pages and calling ``extract_text`` must produce a
    ``PdfReadWarning`` whose message matches
    "impossible to decode XFormObject /Meta203".
    """
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/972/972962.pdf",
        name="tika-972962.pdf",
    )
    # The reader is constructed before entering ``pytest.warns``; only the
    # page iteration and text extraction run inside the warning check.
    pdf = PdfReader(BytesIO(pdf_bytes))
    expected_message = "impossible to decode XFormObject /Meta203"
    with pytest.warns(PdfReadWarning, match=expected_message):
        for pg in pdf.pages:
            pg.extract_text()

0 comments on commit 5d5b2d8

Please sign in to comment.