Skip to content

Commit

Permalink
TST: Add workflow tests (#1287)
Browse files (browse the repository at this point in the history)
  • Loading branch information
MartinThoma authored Aug 27, 2022
1 parent e909d8c commit ceb997d
Showing 1 changed file with 60 additions and 7 deletions.
67 changes: 60 additions & 7 deletions tests/test_workflows.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -418,42 +418,88 @@ def test_get_metadata(url, name):


@pytest.mark.parametrize(
("url", "name"),
("url", "name", "strict", "exception"),
[
(
"https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
"tika-938702.pdf",
False,
(PdfReadError, "Unexpected end of stream"),
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
"tika-942358.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/911/911260.pdf",
"tika-911260.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/992/992472.pdf",
"tika-992472.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/978/978477.pdf",
"tika-978477.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/960/960317.pdf",
"tika-960317.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/930/930513.pdf",
"tika-930513.pdf",
False,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/918/918113.pdf",
"tika-918113.pdf",
True,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/940/940704.pdf",
"tika-940704.pdf",
True,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/976/976488.pdf",
"tika-976488.pdf",
True,
None,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/948/948176.pdf",
"tika-948176.pdf",
True,
None,
),
],
)
def test_extract_text(url, name):
def test_extract_text(url, name, strict, exception):
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data)
reader.metadata
reader = PdfReader(data, strict=strict)
if not exception:
for page in reader.pages:
page.extract_text()
else:
exc, exc_text = exception
with pytest.raises(exc) as ex_info:
for page in reader.pages:
page.extract_text()
assert ex_info.value.args[0] == exc_text


@pytest.mark.parametrize(
Expand Down Expand Up @@ -481,21 +527,28 @@ def test_compress_raised(url, name):


@pytest.mark.parametrize(
("url", "name"),
("url", "name", "strict"),
[
(
"https://corpora.tika.apache.org/base/docs/govdocs1/915/915194.pdf",
"tika-915194.pdf",
False,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/950/950337.pdf",
"tika-950337.pdf",
False,
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/962/962292.pdf",
"tika-962292.pdf",
True,
),
],
)
def test_compress(url, name):
def test_compress(url, name, strict):
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data)
reader = PdfReader(data, strict=strict)
# TODO: which page exactly?
# TODO: Is it reasonable to have an exception here?
for page in reader.pages:
Expand Down

0 comments on commit ceb997d

Please sign in to comment.