From 0b693e1122d568f29f266340121915b3813eb8c2 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 17 Jul 2022 20:41:45 +0200 Subject: [PATCH] TST: Add test for arab text (#1127) --- sample-files | 2 +- tests/test_page.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sample-files b/sample-files index 31763905b..200644f72 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 31763905b4a06014cbd23d2e03b7b5616661fed5 +Subproject commit 200644f7219811c3930ad1732ef70c570ece2d16 diff --git a/tests/test_page.py b/tests/test_page.py index aa6539d94..d6e35e184 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -2,6 +2,7 @@ import os from copy import deepcopy from io import BytesIO +from pathlib import Path import pytest @@ -16,11 +17,11 @@ TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources") -EXTERNAL_ROOT = os.path.join(PROJECT_ROOT, "sample-files") +EXTERNAL_ROOT = Path(PROJECT_ROOT) / "sample-files" def get_all_sample_files(): - with open(os.path.join(EXTERNAL_ROOT, "files.json")) as fp: + with open(EXTERNAL_ROOT / "files.json") as fp: data = fp.read() meta = json.loads(data) return meta @@ -37,7 +38,7 @@ def get_all_sample_files(): ) @pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_read(meta): - pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"]) + pdf_path = EXTERNAL_ROOT / meta["path"] reader = PdfReader(pdf_path) reader.pages[0] assert len(reader.pages) == meta["pages"] @@ -342,3 +343,9 @@ def test_empyt_password_1088(): stream = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(stream) len(reader.pages) + + +@pytest.mark.xfail(reason="#1088 / #1126") +def test_arab_text_extraction(): + reader = PdfReader(EXTERNAL_ROOT / "015-arabic/habibi.pdf") + assert reader.pages[0].extract_text() == "habibi حَبيبي"