Skip to content

Commit

Permalink
fix odd space
Browse files: browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Jul 10, 2022
1 parent fa7d8fe commit 758738d
Showing 1 changed file with 25 additions and 22 deletions.
47 changes: 25 additions & 22 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1275,29 +1275,32 @@ def process_operation(operator: bytes, operands: List) -> None:

elif operator == b"Tj":
check_crlf_space = True
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
else:
return None
if check_crlf_space:
Expand Down

0 comments on commit 758738d

Please sign in to comment.