diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 031aa1471..92886a95a 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1218,7 +1218,13 @@ def process_operation(operator: bytes, operands: List) -> None: else operands[0] ) if isinstance(cmap[0], str): - t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + try: + t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + except: # the data does not match the expectation, we use the alternative ; text extraction may not be good + t = tt.decode( + "utf-16-be" if cmap[0] == "charmap" else "charmap", + "surrogatepass", + ) # apply str encoding else: # apply dict encoding t = "".join( [