Text extraction: relax the TJ space-insertion threshold.

In the TJ branch of process_operation, a numeric operand now triggers a
space when its absolute value is at least 95% of _space_width (previously
100%). Adds a regression test for #1328 that downloads a sample PDF and
checks that "Reporting crude oil leak.\n" appears in the extracted text.

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e4ec053c8..87b914ce2 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1985,11 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 process_operation(b"TL", [-operands[1]])
                 process_operation(b"Td", operands)
             elif operator == b"TJ":
+                # The space width may be smaller than the font width, so the width should be 95%.
+                _confirm_space_width = _space_width * 0.95
                 for op in operands[0]:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _space_width)
+                        (abs(float(op)) >= _confirm_space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 2f0eaad1d..8bfa1809e 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -189,3 +189,14 @@ def test_layout_mode_warnings(mock_logger_warning):
     mock_logger_warning.assert_called_with(
         "Argument visitor_text is ignored in layout mode", "pypdf._page"
     )
+
+
+@pytest.mark.enable_socket()
+def test_space_with_one_unit_smaller_than_font_width():
+    """Tests for #1328"""
+    url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf"
+    name = "iss1328.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    extracted = page.extract_text()
+    assert "Reporting crude oil leak.\n" in extracted