diff --git a/pypdf/_page.py b/pypdf/_page.py
index 11507de96..941b833bc 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2183,6 +2183,7 @@ def _layout_mode_text(
         scale_weight: float = 1.25,
         strip_rotated: bool = True,
         debug_path: Optional[Path] = None,
+        font_height_weight: float = 1.0,
     ) -> str:
         """
         Get text preserving fidelity to source PDF text layout.
@@ -2202,6 +2203,8 @@ def _layout_mode_text(
                   - bts.json: text render ops left justified and grouped by BT/ET operators
                   - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
                 Defaults to None.
+            font_height_weight: multiplier for font height when calculating
+                blank lines. Defaults to 1.0.
 
         Returns:
             str: multiline string containing page text in a fixed width format that
@@ -2232,7 +2235,7 @@ def _layout_mode_text(
 
         char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
 
-        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
+        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)
 
     def extract_text(
         self,
@@ -2307,6 +2310,8 @@ def extract_text(
                   - tjs.json: individual text render ops with corresponding transform matrices
                   - bts.json: text render ops left justified and grouped by BT/ET operators
                   - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+            layout_mode_font_height_weight (float): multiplier for font height when calculating
+                blank lines. Defaults to 1.0.
 
         Returns:
             The extracted text
@@ -2329,6 +2334,7 @@ def extract_text(
                 scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
                 strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
                 debug_path=kwargs.get("layout_mode_debug_path", None),
+                font_height_weight=kwargs.get("layout_mode_font_height_weight", 1.0),
             )
         if len(args) >= 1:
             if isinstance(args[0], str):
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
index e7af1b234..ac05aa59a 100644
--- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
+++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -338,7 +338,10 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl
 
 
 def fixed_width_page(
-    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
+    ty_groups: Dict[int, List[BTGroup]],
+    char_width: float,
+    space_vertically: bool,
+    font_height_weight: float = 1.0,
 ) -> str:
     """
     Generate page text from text operations grouped by rendered y coordinate.
@@ -347,6 +350,8 @@ def fixed_width_page(
         ty_groups: dict of text show ops as returned by y_coordinate_groups()
         char_width: fixed character width
         space_vertically: include blank lines inferred from y distance + font height.
+        font_height_weight: multiplier for font height when calculating blank
+            lines. Defaults to 1.0.
 
     Returns:
         str: page text in a fixed width format that closely adheres to the rendered
@@ -357,7 +362,11 @@ def fixed_width_page(
     for y_coord, line_data in ty_groups.items():
         if space_vertically and lines:
             blank_lines = (
-                int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
+                int(
+                    abs(y_coord - last_y_coord)
+                    / (line_data[0]["font_height"] * font_height_weight)
+                )
+                - 1
             )
             lines.extend([""] * blank_lines)
         line = ""