Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Extract Text Enhancement (whitespaces) #1084

Merged
merged 11 commits into from
Jul 13, 2022
53 changes: 35 additions & 18 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,17 @@ def build_char_map(
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
if font_name in _default_fonts_space_width:
try:
# override space_width with new params
space_width = _default_fonts_space_width[font_name]
sp_width = compute_space_width(ft, space_code, space_width)
space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
pass
# I consider the space_code to be available on one byte
if isinstance(space_code, str):
sp = space_code.encode("charmap")[0]
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)

return (
font_type,
Expand Down Expand Up @@ -193,7 +200,7 @@ def parse_to_unicode(
)

for l in cm.split(b"\n"):
if l in (b"", b" "):
if l in (b"", b" ") or l[0] == 37: # 37 = %
continue
if b"beginbfrange" in l:
process_rg = True
Expand Down Expand Up @@ -224,7 +231,7 @@ def parse_to_unicode(
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % len(lst[2])
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
Expand Down Expand Up @@ -259,30 +266,40 @@ def compute_space_width(
) -> float:
sp_width: float = space_width * 2 # default value
w = []
w1 = {}
st: int = 0
if "/W" in ft:
if "/DW" in ft:
sp_width = cast(float, ft["/DW"])
w = list(ft["/W"]) # type: ignore
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
w1[-1] = cast(float, ft1["/DW"])
except Exception:
w1[-1] = 1000.0
w = list(ft1["/W"]) # type: ignore
while len(w) > 0:
st = w[0]
second = w[1]
if isinstance(int, second):
if st <= space_code and space_code <= second:
sp_width = w[2]
break
if isinstance(second, int):
for x in range(st, second):
w1[x] = w[2]
w = w[3:]
if isinstance(list, second):
if st <= space_code and space_code <= st + len(second) - 1:
sp_width = second[space_code - st]
elif isinstance(second, list):
for y in second:
w1[st] = y
st += 1
w = w[2:]
else:
warnings.warn(
"unknown widths : \n" + (ft["/W"]).__repr__(),
"unknown widths : \n" + (ft1["/W"]).__repr__(),
PdfReadWarning,
)
break
if "/Widths" in ft:
try:
sp_width = w1[space_code]
except Exception:
sp_width = (
w1[-1] / 2.0
) # if using default we consider space will be only half size
elif "/Widths" in ft:
w = list(ft["/Widths"]) # type: ignore
try:
st = cast(int, ft["/FirstChar"])
Expand Down
207 changes: 158 additions & 49 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,22 +1143,53 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [
1.0,
0.0,
0.0,
1.0,
0.0,
0.0,
] # will store cm_matrix * tm_matrix
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
TL = 0.0
font_size = 12.0 # init just in case of

# tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text,
# char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width,
# TL: float = TL, font_size: float = font_size, cmap = cmap
def sign(x: float) -> float:
    """Return 1 when *x* is greater than or equal to zero, otherwise -1."""
    if x >= 0:
        return 1
    return -1

def mult(m: List[float], n: List[float]) -> List[float]:
    """Multiply two six-element matrices given as ``[a, b, c, d, e, f]``.

    The first four entries are combined as a 2x2 product; the last two
    entries of *m* are transformed by the 2x2 part of *n* and then offset by
    the last two entries of *n*.
    """
    m_a, m_b, m_c, m_d, m_e, m_f = (m[i] for i in range(6))
    n_a, n_b, n_c, n_d, n_e, n_f = (n[i] for i in range(6))

    # 2x2 part: rows of m times columns of n.
    top_left = m_a * n_a + m_b * n_c
    top_right = m_a * n_b + m_b * n_d
    bottom_left = m_c * n_a + m_d * n_c
    bottom_right = m_c * n_b + m_d * n_d

    # Offset part: m's offset pushed through n, then n's own offset added.
    offset_x = m_e * n_a + m_f * n_c + n_e
    offset_y = m_e * n_b + m_f * n_d + n_f

    return [top_left, top_right, bottom_left, bottom_right, offset_x, offset_y]

def orient(m: List[float]) -> int:
    """Classify a six-element matrix into one of 0, 180, 90 or 270.

    ``m[3]`` decides between 0 and 180 when its magnitude exceeds 1e-6;
    otherwise the sign of ``m[1]`` decides between 90 and 270.
    """
    d_component = m[3]
    if d_component > 1e-6:
        return 0
    if d_component < -1e-6:
        return 180
    # m[3] is (almost) zero: fall back to m[1] to tell 90 from 270.
    return 90 if m[1] > 0 else 270

def current_spacewidth() -> float:
    """Return the enclosing scope's ``_space_width`` divided by 1000."""
    # NOTE: space_scale and char_scale are not factored into this value.
    thousandth = _space_width / 1000.0
    return thousandth

def process_operation(operator: bytes, operands: List) -> None:
nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the
tm_prev = list(tm_matrix)
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
check_crlf_space: bool = False
# Table 5.4 page 405
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Expand All @@ -1172,6 +1203,29 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"ET":
output += text
text = ""
# table 4.7, page 219
# cm_matrix calculation is reserved for the moment
elif operator == b"q":
cm_stack.append(cm_matrix)
elif operator == b"Q":
try:
cm_matrix = cm_stack.pop()
except Exception:
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
elif operator == b"cm":
output += text
text = ""
cm_matrix = mult(
[
float(operands[0]),
float(operands[1]),
float(operands[2]),
float(operands[3]),
float(operands[4]),
float(operands[5]),
],
cm_matrix,
)
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
Expand Down Expand Up @@ -1203,9 +1257,11 @@ def process_operation(operator: bytes, operands: List) -> None:
pass # keep previous size
# Table 5.5 page 406
elif operator == b"Td":
tm_matrix[5] += float(operands[1])
check_crlf_space = True
tm_matrix[4] += float(operands[0])
tm_matrix[5] += float(operands[1])
elif operator == b"Tm":
check_crlf_space = True
tm_matrix = [
float(operands[0]),
float(operands[1]),
Expand All @@ -1215,56 +1271,101 @@ def process_operation(operator: bytes, operands: List) -> None:
float(operands[5]),
]
elif operator == b"T*":
check_crlf_space = True
tm_matrix[5] -= TL

elif operator == b"Tj":
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
check_crlf_space = True
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
Comment on lines +1279 to 1287
Copy link

@biredel biredel May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The second `isinstance(operands[0], str)` branch looks unreachable here (since it was moved over here).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Do you want to submit a corresponding PR?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stefan6419846 At least not immediately. Clearly there was or is something to that code that I do not understand enough to just delete it from a version that no longer contains the explanation - #2440 was merged much later!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@biredel
I'm confused:
you are looking at an obsolete branch (PyPDF2) instead of pypdf.
The current code seems to be this one:
https://github.com/py-pdf/pypdf/blob/a435eaaa08c71e3f66320edd06be24637ef32986/pypdf/_text_extraction/__init__.py#L225C18-L234C18

Codecov indicates some test coverage.


text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
else:
return None
# process text changes due to positionchange: " "
if tm_matrix[5] <= (
tm_prev[5]
- font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2)
): # it means that we are moving down by one line
output += text + "\n" # .translate(cmap) + "\n"
text = ""
elif tm_matrix[4] >= (
tm_prev[4] + space_scale * _space_width * char_scale
): # it means that we are moving down by one line
text += " "
return None
# for clarity Operator in (b"g",b"G") : nothing to do
# end of process_operation ######
if check_crlf_space:
m = mult(tm_matrix, cm_matrix)
o = orient(m)
deltaX = m[4] - tm_prev[4]
deltaY = m[5] - tm_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
tm_prev = m
try:
if o == 0:
if deltaY < -0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaY) < f * 0.3
and abs(deltaX) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 180:
if deltaY > 0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaY) < f * 0.3
and abs(deltaX) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 90:
if deltaX > 0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaX) < f * 0.3
and abs(deltaY) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 270:
if deltaX < -0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaX) < f * 0.3
and abs(deltaY) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
except Exception:
pass

for operands, operator in content.operations:
# multiple operators are defined in here ####
if operator == b"'":
process_operation(b"T*", [])
process_operation(b"Tj", operands)
elif operator == b'"':
process_operation(b"Tw", [operands[0]])
process_operation(b"Tc", [operands[1]])
process_operation(b"T*", [])
process_operation(b"TJ", operands)
process_operation(b"Tj", operands[2:])
elif operator == b"TD":
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
Expand All @@ -1273,15 +1374,23 @@ def process_operation(operator: bytes, operands: List) -> None:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
if isinstance(op, (int, float, NumberObject, FloatObject)):
process_operation(b"Td", [-op, 0.0])
if (
(abs(float(op)) >= _space_width)
and (abs(float(op)) <= 8 * _space_width)
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
elif operator == b"Do":
output += text
if output != "":
output += "\n"
try:
if output[-1] != "\n":
output += "\n"
except IndexError:
pass
try:
xobj = resources_dict["/XObject"] # type: ignore
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
output += text
# output += text
text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
output += text
except Exception:
Expand Down
1 change: 1 addition & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def get_all_sample_files():
[m for m in all_files_meta["data"] if not m["encrypted"]],
ids=[m["path"] for m in all_files_meta["data"] if not m["encrypted"]],
)
@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
def test_read(meta):
pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
reader = PdfReader(pdf_path)
Expand Down