From a04b65b53519ffd0c2001c4006f40212a2f15d99 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 1 May 2023 09:06:02 +0200 Subject: [PATCH] TST: Parametrize test_cmap_encodings (#1823) --- tests/test_cmap.py | 53 +++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 6ffdf2da0..ce91fd23c 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -109,20 +109,28 @@ def test_ascii_charset(): @pytest.mark.enable_socket() @pytest.mark.parametrize( - ("url", "name", "page_nb"), + ("url", "name", "page_nb", "within_text"), [ ( "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf", "cmap1370.pdf", 0, + "", + ), + ( + "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", + "02voc.pdf", + 2, + "Document delineation and character sequence decoding", ), - ("https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", "02voc.pdf", 2), ], ids=["iss1370", "iss1379"], ) -def test_text_extraction_of_specific_pages(url: str, name: str, page_nb: int): +def test_text_extraction_of_specific_pages( + url: str, name: str, page_nb: int, within_text +): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[page_nb].extract_text() + assert within_text in reader.pages[page_nb].extract_text() @pytest.mark.enable_socket() @@ -135,17 +143,28 @@ def test_iss1533(): @pytest.mark.enable_socket() -def test_ucs2_gbk(caplog): - url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf" - name = "tstUCS2.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[1].extract_text() # no error - assert caplog.text == "" - # iss 1809 - url = "https://github.com/py-pdf/pypdf/files/11315397/3.pdf" - name = "tst-GBK_EUC.pdf" +@pytest.mark.parametrize( + ("url", "name", "page_index", "within_text", "caplog_text"), + [ + ( + "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf", + "tstUCS2.pdf", + 1, + ["2 / 12", "S0490520090001", "于博"], + "", + ), + ( + "https://github.com/py-pdf/pypdf/files/11315397/3.pdf", + "tst-GBK_EUC.pdf", + 0, + ["NJA", "中华男科学杂志"], + "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n", + ), + ], +) +def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - t = reader.pages[0].extract_text() - assert "NJA" in t - assert "中华男科学杂志" in t - # assert caplog.text == "" a duplicate field confirmed in page 0, so no check of caplog + extracted = reader.pages[page_index].extract_text() # no error + for contained in within_text: + assert contained in extracted + assert caplog_text in caplog.text