Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: invalid cm/tm in visitor functions #2206

Merged
merged 15 commits into from
Oct 8, 2023
Merged
46 changes: 31 additions & 15 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1946,7 +1946,10 @@ def _extract_text(
1.0,
0.0,
0.0,
] # will store previous tm_matrix
] # will store cm_matrix * tm_matrix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the old comment was more correct, right? This change was probably a side effect of the revert.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct we store inhere only tm_matrix

# memo_cm/tm will be used to store the position at the beginning of building the text
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand All @@ -1957,9 +1960,9 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
nonlocal orientations, rtl_dir, visitor_text, output, text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
Expand All @@ -1968,14 +1971,18 @@ def process_operation(operator: bytes, operands: List) -> None:
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is a reserved for the moment
elif operator == b"q":
Expand Down Expand Up @@ -2006,8 +2013,10 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
cm_matrix = mult(
[
float(operands[0]),
Expand All @@ -2030,8 +2039,10 @@ def process_operation(operator: bytes, operands: List) -> None:
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
try:
# charMapTuple: font_type, float(sp_width / 2), encoding,
# map_dict, font-dictionary
Expand Down Expand Up @@ -2102,17 +2113,19 @@ def process_operation(operator: bytes, operands: List) -> None:
try:
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
(cm_prev, tm_prev),
(cm_matrix, tm_matrix),
(memo_cm, memo_tm),
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
if text == "":
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
except OrientationNotFoundError:
return None

Expand Down Expand Up @@ -2144,12 +2157,12 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
if visitor_text is not None:
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size)
except IndexError:
pass
try:
Expand All @@ -2165,21 +2178,24 @@ def process_operation(operator: bytes, operands: List) -> None:
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
finally:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand Down
31 changes: 19 additions & 12 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
cmtm_prev: Tuple[List[float], List[float]],
cmtm_matrix: Tuple[List[float], List[float]],
memo_cmtm: Tuple[List[float], List[float]],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
Expand All @@ -100,13 +99,21 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
cm_prev = cmtm_prev[0]
tm_prev = cmtm_prev[1]
cm_matrix = cmtm_matrix[0]
tm_matrix = cmtm_matrix[1]
memo_cm = memo_cmtm[0]
memo_tm = memo_cmtm[1]

m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
cm_prev = m
Copy link
Contributor

@troethe troethe Sep 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look quite right. I think this line can however be safely removed, because cm_prev doesn't get accessed until it is being assigned to again at the end of this function (i.e. this is currently a noop).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree.
Good point to have noticed it . Actually I've forgot to pass the multiplied cm_matrix to the visitor

if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,8 +124,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -136,8 +143,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -155,8 +162,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -174,8 +181,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand Down
78 changes: 78 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
p = writer.add_blank_page(100, 100)
p[NameObject("/Contents")] = writer._add_object(NullObject())
p.get_contents()


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor():
"""See #2200"""
url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
name = "test_text_pos.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
p = ()

def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
nonlocal p
if text.startswith("5425."):
p = (tm[4], tm[5])

reader.pages[0].extract_text(visitor_text=visitor_body2)
assert abs(p[0] - 323.5) < 0.1
assert abs(p[1] - 457.4) < 0.1


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor2():
"""See #2075"""
url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf"
name = "LegIndex-page6.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
x_lvl = 26
lst = []

def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
nonlocal x_lvl, lst
if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210:
lst.append(text.strip(" \n"))

reader.pages[0].extract_text(visitor_text=visitor_lvl)
assert lst == [
"ACUPUNCTURE BOARD",
"ACUPUNCTURISTS AND ACUPUNCTURE",
"ADMINISTRATIVE LAW AND PROCEDURE",
"ADMINISTRATIVE LAW, OFFICE OF",
"ADOPTION",
"ADULT EDUCATION",
"ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
]
x_lvl = 35
lst = []
reader.pages[0].extract_text(visitor_text=visitor_lvl)
assert lst == [
"members, AB 1264",
"assistants, acupuncture, AB 1264",
"complaints, investigations, etc., AB 1264",
"day, california acupuncture, HR 48",
"massage services, asian, AB 1264",
"supervising acupuncturists, AB 1264",
"supportive acupuncture services, basic, AB 1264",
"rules and regulations—",
"professional assistants and employees: employment and compensation, AB 916",
"adults, adoption of, AB 1756",
"agencies, organizations, etc.: requirements, prohibitions, etc., SB 807",
"assistance programs, adoption: nonminor dependents, SB 9",
"birth certificates, AB 1302",
"contact agreements, postadoption—",
"facilitators, adoption, AB 120",
"failed adoptions: reproductive loss leave, SB 848",
"hearings, adoption finalization: remote proceedings, technology, etc., SB 21",
"native american tribes, AB 120",
"parental rights, reinstatement of, AB 20",
"parents, prospective adoptive: criminal background checks, SB 824",
"services, adult educational, SB 877",
"week, adult education, ACR 31",
"alcoholic beverages: tied-house restrictions, AB 546",
"campaign re social equity, civil rights, etc., SB 447",
"cannabis, AB 794",
"elections. See ELECTIONS.",
"false, misleading, etc., advertising—",
"hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683",
"housing rental properties advertised rates: disclosures, SB 611",
]