Skip to content

Commit

Permalink
BUG: Cope with encoding with too many differences (#2873)
Browse files Browse the repository at this point in the history
Closes #2836.
  • Loading branch information
pubpub-zz authored Sep 26, 2024
1 parent dcd15aa commit 3b89062
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
3 changes: 2 additions & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ def parse_encoding(
x = o
else: # isinstance(o,str):
try:
encoding[x] = adobe_glyphs[o] # type: ignore
if x < len(encoding):
encoding[x] = adobe_glyphs[o] # type: ignore
except Exception:
encoding[x] = o # type: ignore
if o == " ":
Expand Down
11 changes: 11 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,14 @@ def test_unigb_utf16():
name = "iss2812.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert "《中国能源展望 2060(2024 年版)》编写委员会" in reader.pages[1].extract_text()


@pytest.mark.enable_socket()
def test_too_many_differences():
    """
    Regression test for issue #2836.

    Downloads the PDF attached to the issue (cached as ``iss2836.pdf``)
    and checks that extracting the text of its first page yields an
    empty string.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf",
        name="iss2836.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    # The test expects an empty result for this document.
    assert first_page.extract_text() == ""

0 comments on commit 3b89062

Please sign in to comment.