Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coords in extract text #1389

Closed
wants to merge 8 commits into from
155 changes: 123 additions & 32 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,14 +1310,42 @@ def _extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
See extract_text for most arguments.

Args:
content_key: indicate the default key where to extract data
None = the object; this allows reusing the function on an XObject
default = "/Content"
Arabic, Hebrew, ... are extracted in the correct order. If required, a custom RTL range of characters
can be defined; see function set_custom_rtl

Additionally you can provide visitor-methods to get informed on all operands and all text-objects.
For example in some PDF files this can be useful to parse tables.

:param Tuple[int, ...] orientations: list of orientations text_extraction will look for
default = (0, 90, 180, 270)
note: currently only 0 (up), 90 (turned left), 180 (upside down), 270 (turned right)
:param float space_width: force default space width
(if not extracted from font; default 200)
:param Optional[str] content_key: indicate the default key where to extract data
None = the object; this allows reusing the function on an XObject
default = "/Content"
:param Optional[Function] visitor_operand_before: function to be called before processing an operand.
It has four arguments: operand, operand-arguments,
current transformation matrix and text matrix.
:param Optional[Function] visitor_operand_after: function to be called after processing an operand.
It has four arguments: operand, operand-arguments,
current transformation matrix and text matrix.
:param Optional[Function] visitor_text: function to be called when extracting some text at some position.
It has five arguments: text,
current transformation matrix, text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
:param Optional[bool] group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.
:param content_key: indicate the default key where to extract data
None = the object; this allows reusing the function on an XObject
default = "/Content"
:return: a string object.
"""
text: str = ""
output: str = ""
Expand Down Expand Up @@ -1411,17 +1439,13 @@ def process_operation(operator: bytes, operands: List) -> None:
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
# tm_prev = tm_matrix
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
# based
# if output != "" and output[-1]!="\n":
# output += "\n"
text = ""
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is reserved for the moment
Expand Down Expand Up @@ -1453,8 +1477,6 @@ def process_operation(operator: bytes, operands: List) -> None:
# rtl_dir = False
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -1478,8 +1500,6 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"Tf":
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
# rtl_dir = False
try:
Expand Down Expand Up @@ -1537,6 +1557,10 @@ def process_operation(operator: bytes, operands: List) -> None:
if orientation in orientations:
if isinstance(operands[0], str):
text += operands[0]
if visitor_text is not None:
visitor_text(
operands[0], cm_matrix, tm_matrix, cmap[3], font_size
)
else:
t: str = ""
tt: bytes = (
Expand All @@ -1562,6 +1586,7 @@ def process_operation(operator: bytes, operands: List) -> None:
]
)
# "\u0590 - \u08FF \uFB50 - \uFDFF"
tj_text = ""
for x in "".join(
[cmap[1][x] if x in cmap[1] else x for x in t]
):
Expand All @@ -1574,7 +1599,7 @@ def process_operation(operator: bytes, operands: List) -> None:
or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
):
text = x + text if rtl_dir else text + x
tj_text = x + tj_text if rtl_dir else tj_text + x
elif ( # right-to-left characters set
(0x0590 <= xx and xx <= 0x08FF)
or (0xFB1D <= xx and xx <= 0xFDFF)
Expand All @@ -1586,21 +1611,22 @@ def process_operation(operator: bytes, operands: List) -> None:
rtl_dir = True
# print("RTL",text,"*")
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = x + text
tj_text = x + tj_text
else: # left-to-right
# print(">",xx,x,end="")
if rtl_dir:
rtl_dir = False
# print("LTR",text,"*")
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = text + x
tj_text = tj_text + x
# fmt: on
text = tj_text + text if rtl_dir else text + tj_text
if visitor_text is not None:
visitor_text(
tj_text, cm_matrix, tm_matrix, cmap[3], font_size
)
else:
return None
if check_crlf_space:
Expand All @@ -1620,7 +1646,7 @@ def process_operation(operator: bytes, operands: List) -> None:
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
cm_matrix,
tm_matrix,
cmap[3],
Expand All @@ -1633,13 +1659,21 @@ def process_operation(operator: bytes, operands: List) -> None:
):
if (output + text)[-1] != " ":
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 180:
if delta_y > 0.8 * f:
if deltaY > 0.8 * f:
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
cm_matrix,
tm_matrix,
cmap[3],
Expand All @@ -1652,13 +1686,21 @@ def process_operation(operator: bytes, operands: List) -> None:
):
if (output + text)[-1] != " ":
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 90:
if delta_x > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
cm_matrix,
tm_matrix,
cmap[3],
Expand All @@ -1671,13 +1713,21 @@ def process_operation(operator: bytes, operands: List) -> None:
):
if (output + text)[-1] != " ":
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 270:
if delta_x < -0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
cm_matrix,
tm_matrix,
cmap[3],
Expand All @@ -1690,6 +1740,14 @@ def process_operation(operator: bytes, operands: List) -> None:
):
if (output + text)[-1] != " ":
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
except Exception:
pass

Expand All @@ -1709,6 +1767,28 @@ def process_operation(operator: bytes, operands: List) -> None:
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
elif operator == b"TJ":
if visitor_text is not None and group_TJ:
# To prevent sending letters instead of words we
# override the visitor temporarily.
visitor_text_before = visitor_text
tm_matrix_before = [
tm_matrix[0],
tm_matrix[1],
tm_matrix[2],
tm_matrix[3],
tm_matrix[4],
tm_matrix[5],
]
text_TJ = []

def visitor_text(text, cm_matrix, tm_matrix, font_dict, font_size):
# TODO cases where the current inserting order is kept
if rtl_dir:
# right-to-left
text_TJ.insert(0, text)
else:
text_TJ.append(text)

for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
Expand All @@ -1719,10 +1799,17 @@ def process_operation(operator: bytes, operands: List) -> None:
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
if visitor_text is not None and group_TJ:
visitor_text = visitor_text_before
visitor_text(
"".join(text_TJ),
cm_matrix,
tm_matrix_before,
cmap[3],
font_size,
)
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
Expand All @@ -1741,10 +1828,9 @@ def process_operation(operator: bytes, operands: List) -> None:
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
Expand All @@ -1757,8 +1843,6 @@ def process_operation(operator: bytes, operands: List) -> None:
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
return output

def extract_text(
Expand All @@ -1771,6 +1855,7 @@ def extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Locate all text drawing commands, in the order they are provided in the
Expand Down Expand Up @@ -1804,10 +1889,12 @@ def extract_text(
It has four arguments: operand, operand-arguments,
current transformation matrix and text matrix.
visitor_text: function to be called when extracting some text at some position.
It has five arguments: text, current transformation matrix,
text matrix, font-dictionary and font-size.
It has five arguments: text,
current transformation matrix, text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.

Returns:
The extracted text
Expand Down Expand Up @@ -1857,6 +1944,7 @@ def extract_text(
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)

def extract_xform_text(
Expand All @@ -1867,12 +1955,15 @@ def extract_xform_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Extract text from an XObject.

Args:
space_width: force default space width (if not extracted from font; default 200)
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.

Returns:
The extracted text
Expand Down
Loading