Skip to content

Commit

Permalink
ROB: 'utf-16-be' codec can't decode (...) (#995)
Browse files Browse the repository at this point in the history
Closes #988
  • Loading branch information
pubpub-zz authored Jun 15, 2022
1 parent e292822 commit 034d7a9
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 2 deletions.
1 change: 0 additions & 1 deletion PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ def parse_to_unicode(
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
assert a > b
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % len(lst[2])
Expand Down
8 changes: 7 additions & 1 deletion PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1218,7 +1218,13 @@ def process_operation(operator: bytes, operands: List) -> None:
else operands[0]
)
if isinstance(cmap[0], str):
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
Expand Down
2 changes: 2 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import ssl
import urllib.request


Expand All @@ -21,6 +22,7 @@ def get_pdf_from_url(url: str, name: str) -> bytes:
os.mkdir(cache_dir)
cache_path = os.path.join(cache_dir, name)
if not os.path.exists(cache_path):
ssl._create_default_https_context = ssl._create_unverified_context
with urllib.request.urlopen(url) as response, open(
cache_path, "wb"
) as out_file:
Expand Down
5 changes: 5 additions & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,11 @@ def test_rotate_45():
[0, 1, 34, 35, 36, 118, 119, 120, 121],
),
(True, "https://github.com/py-pdf/PyPDF2/files/8884493/998167.pdf", [0]),
(
True,
"https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf",
[0, 1, 5, 8, 14],
),
],
)
def test_extract_textbench(enable, url, pages, print_result=False):
Expand Down

0 comments on commit 034d7a9

Please sign in to comment.