ROB: 'utf-16-be' codec can't decode (...) py-pdf#988

The data bytes do not match the expected encoding.
pubpub-zz · Jun 14, 2022 · 641804f · 641804f
1 parent 712c16d
commit 641804f
Showing 1 changed file with 7 additions and 1 deletion.
diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1218,7 +1218,13 @@ def process_operation(operator: bytes, operands: List) -> None:
                     else operands[0]
                 )
                 if isinstance(cmap[0], str):
-                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+                    try:
+                        t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+                    except:  # the data does not match the expectation, we use the alternative ; text extraction may not be good
+                        t = tt.decode(
+                            "utf-16-be" if cmap[0] == "charmap" else "charmap",
+                            "surrogatepass",
+                        )  # apply str encoding
                 else:  # apply dict encoding
                     t = "".join(
                         [