py-pdf · MartinThoma · Sep 17, 2023 · Sep 3, 2023 · Sep 3, 2023 · Sep 3, 2023
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1926,6 +1926,7 @@ def _extract_text(
         # are strings where the byte->string encoding was unknown, so adding
         # them to the text here would be gibberish.
 
+        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_stack = []
         tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1947,7 +1948,7 @@ def current_spacewidth() -> float:
             return _space_width / 1000.0
 
         def process_operation(operator: bytes, operands: List) -> None:
-            nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text
+            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
             nonlocal orientations, rtl_dir, visitor_text
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
@@ -2090,8 +2091,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                 return None
             if check_crlf_space:
                 try:
-                    text, output, tm_prev = crlf_space_check(
+                    text, output, cm_prev, tm_prev = crlf_space_check(
                         text,
+                        cm_prev,
                         tm_prev,
                         cm_matrix,
                         tm_matrix,

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
@@ -87,6 +87,7 @@ def orient(m: List[float]) -> int:
 
 def crlf_space_check(
     text: str,
+    cm_prev: List[float],
     tm_prev: List[float],
     cm_matrix: List[float],
     tm_matrix: List[float],
@@ -98,8 +99,8 @@ def crlf_space_check(
     font_size: float,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
     spacewidth: float,
-) -> Tuple[str, str, List[float]]:
-    m_prev = mult(tm_prev, cm_matrix)
+) -> Tuple[str, str, List[float], List[float]]:
+    m_prev = mult(tm_prev, cm_prev)
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
     delta_x = m[4] - m_prev[4]
@@ -116,7 +117,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
+                            cm_prev,
                             tm_prev,
                             cmap[3],
                             font_size,
@@ -135,7 +136,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
+                            cm_prev,
                             tm_prev,
                             cmap[3],
                             font_size,
@@ -154,7 +155,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
+                            cm_prev,
                             tm_prev,
                             cmap[3],
                             font_size,
@@ -173,7 +174,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
+                            cm_prev,
                             tm_prev,
                             cmap[3],
                             font_size,
@@ -188,7 +189,8 @@ def crlf_space_check(
     except Exception:
         pass
     tm_prev = tm_matrix.copy()
-    return text, output, tm_prev
+    cm_prev = cm_matrix.copy()
+    return text, output, cm_prev, tm_prev
 
 
 def handle_tj(

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -1026,3 +1026,21 @@ def test_iss():
     for i, page in enumerate(reader.pages):
         print(i)
         page.extract_text()
+
+
+@pytest.mark.enable_socket()
+def test_cr_with_cm_operation():
+    """Issue #2138"""
+    url = "https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf"
+    name = "iss2138.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert (
+        """STATUS: FNL
+STYLE: 1172 1172 KNIT SHORTIE SUMMER-B 2023
+Company: AMERICAN EAGLE OUTFITTERS
+Division / Dept: 50 / 170
+Season: SUMMER-B 2023"""
+        in reader.pages[0].extract_text()
+    )
+    # Currently there is still a whitespace missing on the last line,
+    # so we cannot do a full comparison.
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -3,6 +3,14 @@
 from io import BytesIO
 from pathlib import Path
 
+try:
+    from time import thread_time
+except ImportError:
+
+    def thread_time():
+        return 0  # disable test for python 3.6
+
+
 import pytest
 
 from pypdf import (
@@ -1559,16 +1567,17 @@ def test_watermark():
 
 
 @pytest.mark.enable_socket()
-@pytest.mark.timeout(4)  # this was a lot slower before PR #2086
 def test_watermarking_speed():
     url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf"
     name = "bgwatermark.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     url = "https://arxiv.org/pdf/2201.00214.pdf"
     name = "src_doc.pdf"
+    t0 = thread_time()
     writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name)))
     for p in writer.pages:
         p.merge_page(reader.pages[0], over=False)
+    assert (thread_time() - t0) < 4.0
     out_pdf_bytesio = BytesIO()
     writer.write(out_pdf_bytesio)
     pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024