From 295728b4ec5ff4de4a8fb16257c28c485a8fc2b9 Mon Sep 17 00:00:00 2001 From: rchen19 Date: Tue, 5 Sep 2023 14:45:53 -0700 Subject: [PATCH 1/3] BUG: catch the case where w[0] is an `IndirectObject` instead of an int --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 369ab1904..7c8a56e29 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -409,7 +409,7 @@ def compute_space_width( else: w = [] while len(w) > 0: - st = w[0] + st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() if isinstance(second, int): for x in range(st, second): From f92244e2c6276878b9605ce06645ab532505950e Mon Sep 17 00:00:00 2001 From: rchen19 Date: Wed, 6 Sep 2023 14:53:05 -0700 Subject: [PATCH 2/3] BUG: add test for bug fix for issue #2137 - a pdf file from arxiv is included --- tests/test_cmap.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index fe769e1c5..7aab01d47 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -189,3 +189,15 @@ def test_unixxx_glyphs(): txt = reader.pages[0].extract_text() # no error for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): assert pat in txt + + +@pytest.mark.enable_socket() +def test_cmap_compute_space_width(): + # issue 2137 + # original file URL: + # url = "https://arxiv.org/pdf/2005.05909.pdf" + url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" + name = "Morris et al. 
- 2020 - TextAttack A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in NLP.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + txt = reader.pages[0].extract_text() # no error + From 6d9f1fefc26aa5f4f9c5a7a71553a995c349b1ba Mon Sep 17 00:00:00 2001 From: rchen19 Date: Wed, 6 Sep 2023 15:58:31 -0700 Subject: [PATCH 3/3] BUG: fix code style errors in test for issue #2137 - URL too long - file name too long - variable declared but not used --- tests/test_cmap.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 7aab01d47..922bfae58 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -195,9 +195,10 @@ def test_unixxx_glyphs(): def test_cmap_compute_space_width(): # issue 2137 # original file URL: - # url = "https://arxiv.org/pdf/2005.05909.pdf" - url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" - name = "Morris et al. - 2020 - TextAttack A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in NLP.pdf" + url = "https://arxiv.org/pdf/2005.05909.pdf" + # URL from github issue is too long to pass code style check, use original arxiv URL instead + # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" + name = "TextAttack_paper.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - txt = reader.pages[0].extract_text() # no error + reader.pages[0].extract_text() # no error