Skip to content

Commit

Permalink
TST: Parametrize test_cmap_encodings (#1823)
Browse files (browse the repository at this point in the history)
  • Loading branch information
MartinThoma authored May 1, 2023
1 parent dde4c79 commit a04b65b
Showing 1 changed file with 36 additions and 17 deletions.
53 changes: 36 additions & 17 deletions tests/test_cmap.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -109,20 +109,28 @@ def test_ascii_charset():

@pytest.mark.enable_socket()
@pytest.mark.parametrize(
("url", "name", "page_nb"),
("url", "name", "page_nb", "within_text"),
[
(
"https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf",
"cmap1370.pdf",
0,
"",
),
(
"https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf",
"02voc.pdf",
2,
"Document delineation and character sequence decoding",
),
("https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", "02voc.pdf", 2),
],
ids=["iss1370", "iss1379"],
)
def test_text_extraction_of_specific_pages(url: str, name: str, page_nb: int):
def test_text_extraction_of_specific_pages(
url: str, name: str, page_nb: int, within_text
):
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
reader.pages[page_nb].extract_text()
assert within_text in reader.pages[page_nb].extract_text()


@pytest.mark.enable_socket()
Expand All @@ -135,17 +143,28 @@ def test_iss1533():


@pytest.mark.enable_socket()
def test_ucs2_gbk(caplog):
url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf"
name = "tstUCS2.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
reader.pages[1].extract_text() # no error
assert caplog.text == ""
# iss 1809
url = "https://github.com/py-pdf/pypdf/files/11315397/3.pdf"
name = "tst-GBK_EUC.pdf"
@pytest.mark.parametrize(
("url", "name", "page_index", "within_text", "caplog_text"),
[
(
"https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf",
"tstUCS2.pdf",
1,
["2 / 12", "S0490520090001", "于博"],
"",
),
(
"https://github.com/py-pdf/pypdf/files/11315397/3.pdf",
"tst-GBK_EUC.pdf",
0,
["NJA", "中华男科学杂志"],
"Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n",
),
],
)
def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
t = reader.pages[0].extract_text()
assert "NJA" in t
assert "中华男科学杂志" in t
# assert caplog.text == "" a duplicate field confirmed in page 0, so no check of caplog
extracted = reader.pages[page_index].extract_text() # no error
for contained in within_text:
assert contained in extracted
assert caplog_text in caplog.text

0 comments on commit a04b65b

Please sign in to comment.