diff --git a/pypdf/_page.py b/pypdf/_page.py index 32de40299..80d98287b 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1826,10 +1826,12 @@ def process_operation(operator: bytes, operands: List) -> None: ] ) # "\u0590 - \u08FF \uFB50 - \uFDFF" - for x in "".join( - [cmap[1][x] if x in cmap[1] else x for x in t] - ): - xx = ord(x) + for x in [cmap[1][x] if x in cmap[1] else x for x in t]: + # x can be a sequence of bytes ; ex: habibi.pdf + if len(x) == 1: + xx = ord(x) + else: + xx = 1 # fmt: off if ( # cases where the current inserting order is kept diff --git a/tests/test_page.py b/tests/test_page.py index 47919286a..aa882ea68 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -877,10 +877,14 @@ def test_empyt_password_1088(): len(reader.pages) -@pytest.mark.xfail(reason="#1088 / #1126") -def test_arab_text_extraction(): +@pytest.mark.external +def test_old_habibi(): + # this habibi has some multiple characters associated with the h reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi.pdf") - assert reader.pages[0].extract_text() == "habibi حَبيبي" + txt = reader.pages[0].extract_text() # very odd file + assert ( + "habibi" in txt and "حَبيبي" in txt + ) # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴 @pytest.mark.samples @@ -1016,7 +1020,7 @@ def test_merge_resources(apage1, apage2, expected_result, expected_renames): # Assert assert result == expected_result - assert renames == expected_renames + assert renames == expected_renames def test_merge_page_resources_smoke_test():