From 641804fe85f3e6d66e5f77998b6e2d6fb0f0031e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 14 Jun 2022 22:29:15 +0200 Subject: [PATCH 1/4] ROB : utf-16-be' codec can't decode (...) #988 the data bytes are not matching encoding expectation --- PyPDF2/_page.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 031aa1471..92886a95a 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1218,7 +1218,13 @@ def process_operation(operator: bytes, operands: List) -> None: else operands[0] ) if isinstance(cmap[0], str): - t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + try: + t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + except: # the data does not match the expectation, we use the alternative ; text extraction may not be good + t = tt.decode( + "utf-16-be" if cmap[0] == "charmap" else "charmap", + "surrogatepass", + ) # apply str encoding else: # apply dict encoding t = "".join( [ From 7eaf7543d483f73145c13a3bf82ab0214f51f275 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 14 Jun 2022 22:39:47 +0200 Subject: [PATCH 2/4] flake8 --- PyPDF2/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 92886a95a..bd24fbe28 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1220,7 +1220,7 @@ def process_operation(operator: bytes, operands: List) -> None: if isinstance(cmap[0], str): try: t = tt.decode(cmap[0], "surrogatepass") # apply str encoding - except: # the data does not match the expectation, we use the alternative ; text extraction may not be good + except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good t = tt.decode( "utf-16-be" if cmap[0] == "charmap" else "charmap", "surrogatepass", From e4d8d68c9c953c477b77db0f91d5cb8b83ed38fe Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 14 Jun 2022 22:40:41 +0200 Subject: [PATCH 3/4] fix assertion error #960 --- PyPDF2/_cmap.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index bb9c477a1..430f3d68f 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -222,7 +222,6 @@ def parse_to_unicode( ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") int_entry.append(a) a += 1 - assert a > b else: c = int(lst[2], 16) fmt2 = b"%%0%dX" % len(lst[2]) From 6a33f1109d5fbdbdeb4a88baf595ceb38f2f4bb8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 15 Jun 2022 20:10:15 +0200 Subject: [PATCH 4/4] extend test tika --- tests/__init__.py | 2 ++ tests/test_workflows.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tests/__init__.py b/tests/__init__.py index 637a97714..6aee556d8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,5 @@ import os +import ssl import urllib.request @@ -21,6 +22,7 @@ def get_pdf_from_url(url: str, name: str) -> bytes: os.mkdir(cache_dir) cache_path = os.path.join(cache_dir, name) if not os.path.exists(cache_path): + ssl._create_default_https_context = ssl._create_unverified_context with urllib.request.urlopen(url) as response, open( cache_path, "wb" ) as out_file: diff --git a/tests/test_workflows.py b/tests/test_workflows.py index b928b858a..c5f8ebf20 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -162,6 +162,11 @@ def test_rotate_45(): [0, 1, 34, 35, 36, 118, 119, 120, 121], ), (True, "https://github.com/py-pdf/PyPDF2/files/8884493/998167.pdf", [0]), + ( + True, + "https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf", + [0, 1, 5, 8, 14], + ), ], ) def test_extract_textbench(enable, url, pages, print_result=False):