ROB: 'utf-16-be' codec can't decode (...) (#995)

Closes #988
py-pdf · Jun 15, 2022 · 034d7a9 · 034d7a9
1 parent e292822
commit 034d7a9
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 2 deletions.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -222,7 +222,6 @@ def parse_to_unicode(
                     ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                     int_entry.append(a)
                     a += 1
-                    assert a > b
             else:
                 c = int(lst[2], 16)
                 fmt2 = b"%%0%dX" % len(lst[2])

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1218,7 +1218,13 @@ def process_operation(operator: bytes, operands: List) -> None:
                     else operands[0]
                 )
                 if isinstance(cmap[0], str):
-                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+                    try:
+                        t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+                    except Exception:  # the data does not match the expected encoding; fall back to the alternative codec (text extraction may be inaccurate)
+                        t = tt.decode(
+                            "utf-16-be" if cmap[0] == "charmap" else "charmap",
+                            "surrogatepass",
+                        )  # apply str encoding
                 else:  # apply dict encoding
                     t = "".join(
                         [

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,4 +1,5 @@
 import os
+import ssl
 import urllib.request
 
 
@@ -21,6 +22,7 @@ def get_pdf_from_url(url: str, name: str) -> bytes:
         os.mkdir(cache_dir)
     cache_path = os.path.join(cache_dir, name)
     if not os.path.exists(cache_path):
+        ssl._create_default_https_context = ssl._create_unverified_context
         with urllib.request.urlopen(url) as response, open(
             cache_path, "wb"
         ) as out_file:

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -162,6 +162,11 @@ def test_rotate_45():
             [0, 1, 34, 35, 36, 118, 119, 120, 121],
         ),
         (True, "https://github.com/py-pdf/PyPDF2/files/8884493/998167.pdf", [0]),
+        (
+            True,
+            "https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf",
+            [0, 1, 5, 8, 14],
+        ),
     ],
 )
 def test_extract_textbench(enable, url, pages, print_result=False):