Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Missing new line in extract_text with cm operations #2142

Merged
merged 8 commits into from
Sep 17, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1926,6 +1926,7 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Expand All @@ -1947,7 +1948,7 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
Expand Down Expand Up @@ -2090,8 +2091,9 @@ def process_operation(operator: bytes, operands: List) -> None:
return None
if check_crlf_space:
try:
text, output, tm_prev = crlf_space_check(
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
Expand Down
16 changes: 9 additions & 7 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
Expand All @@ -98,8 +99,8 @@ def crlf_space_check(
font_size: float,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float]]:
m_prev = mult(tm_prev, cm_matrix)
) -> Tuple[str, str, List[float], List[float]]:
m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
Expand All @@ -116,7 +117,7 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
cm_prev,
tm_prev,
cmap[3],
font_size,
Expand All @@ -135,7 +136,7 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
cm_prev,
tm_prev,
cmap[3],
font_size,
Expand All @@ -154,7 +155,7 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
cm_prev,
tm_prev,
cmap[3],
font_size,
Expand All @@ -173,7 +174,7 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
cm_prev,
tm_prev,
cmap[3],
font_size,
Expand All @@ -188,7 +189,8 @@ def crlf_space_check(
except Exception:
pass
tm_prev = tm_matrix.copy()
return text, output, tm_prev
cm_prev = cm_matrix.copy()
return text, output, cm_prev, tm_prev


def handle_tj(
Expand Down
18 changes: 18 additions & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,3 +1026,21 @@ def test_iss():
for i, page in enumerate(reader.pages):
print(i)
page.extract_text()


@pytest.mark.enable_socket()
def test_cr_with_cm_operation():
"""Issue #2138"""
url = "https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf"
name = "iss2138.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert (
"""STATUS: FNL
STYLE: 1172 1172 KNIT SHORTIE SUMMER-B 2023
Company: AMERICAN EAGLE OUTFITTERS
Division / Dept: 50 / 170
Season: SUMMER-B 2023"""
in reader.pages[0].extract_text()
)
# currently threre is still a white space on last line missing
# so we can not do a full comparison.
11 changes: 10 additions & 1 deletion tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
from io import BytesIO
from pathlib import Path

try:
from time import thread_time
except ImportError:

def thread_time():
return 0 # disable test for python 3.6


import pytest

from pypdf import (
Expand Down Expand Up @@ -1559,16 +1567,17 @@ def test_watermark():


@pytest.mark.enable_socket()
@pytest.mark.timeout(4) # this was a lot slower before PR #2086
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that one is the cleaner option. If we change it, it should be a separate PR (but I'm uncertain if we should change it at all).

You mentioned that you had issues with that test. Was it local or in CI? What exactly was the issue / the error message?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it is not the good place, but wanted to discuss about.

The issue during the checks was just it failed with the timeout. It is not the first time this is why I've proposed this change.
I consider the time is ok but for me it is just because We are measuring too many instructions. That' why I've focused on the specif code of the merge.
If you like it just give me a go and I will generate a dedicated PR dealing with this change only

def test_watermarking_speed():
url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf"
name = "bgwatermark.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
url = "https://arxiv.org/pdf/2201.00214.pdf"
name = "src_doc.pdf"
t0 = thread_time()
writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name)))
for p in writer.pages:
p.merge_page(reader.pages[0], over=False)
assert (thread_time() - t0) < 4.0
out_pdf_bytesio = BytesIO()
writer.write(out_pdf_bytesio)
pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024
Expand Down