diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index fde795b01..20e8cdc42 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -50,7 +50,7 @@ def build_char_map_from_dict( Font sub-type, space_width criteria(50% of width), encoding, map character-map. The font-dictionary itself is suitable for the curious. """ - font_type: str = cast(str, ft["/Subtype"]) + font_type = cast(str, ft["/Subtype"].get_object()) space_code = 32 encoding, space_code = parse_encoding(ft, space_code) @@ -75,21 +75,12 @@ def build_char_map_from_dict( for x in int_entry: if x <= 255: encoding[x] = chr(x) - # I consider the space_code is available on one byte if isinstance(space_code, str): - try: # one byte - sp = space_code.encode("charmap")[0] - except Exception: - sp = space_code.encode("utf-16-be") - sp = sp[0] + 256 * sp[1] - try: - sp = ord(map_dict[chr(sp)]) - except KeyError: - pass - else: sp = space_code - font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0) - half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0 + else: + sp = chr(space_code) + font_width_map = build_font_width_map(ft, space_width * 2.0) + half_space_width = compute_space_width(font_width_map, sp) / 2.0 return ( font_type, @@ -403,17 +394,14 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> def build_font_width_map( - ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float + ft: DictionaryObject, default_font_width: float ) -> Dict[Any, float]: font_width_map: Dict[Any, float] = {} st: int = 0 en: int = 0 - if ft is None: - font_width_map["default"] = default_font_width - return font_width_map try: - default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 - except Exception: + default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object)] * 2.0 + except KeyError: pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): # §9.7.4.3 of the 1.7 
reference ("Glyph Metrics in CIDFonts") @@ -435,21 +423,13 @@ def build_font_width_map( # C_first C_last same_W en = second for c_code in range(st, en + 1): - try: - conversion_char = map_dict[chr(c_code)] - font_width_map[conversion_char] = w[2] - except KeyError: - pass + font_width_map[chr(c_code)] = w[2] w = w[3:] elif isinstance(second, list): # Starting_C [W1 W2 ... Wn] c_code = st for width in second: - try: - conversion_char = map_dict[chr(c_code)] - font_width_map[conversion_char] = width - except KeyError: - pass + font_width_map[chr(c_code)] = width c_code += 1 w = w[2:] else: diff --git a/pypdf/_page.py b/pypdf/_page.py index c49a68c33..93662f356 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,13 +49,21 @@ overload, ) -from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map +from ._cmap import ( + build_char_map, + build_font_width_map, + compute_font_width, + parse_encoding, + parse_to_unicode, + unknown_char_map, +) from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( OrientationNotFoundError, _layout_mode, crlf_space_check, - handle_tj, + get_display_str, + get_text_operands, mult, ) from ._utils import ( @@ -84,6 +92,7 @@ PdfObject, RectangleObject, StreamObject, + TextStringObject, is_null_or_none, ) @@ -496,7 +505,7 @@ def __init__( if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) - self._font_width_maps: Dict[str, Dict[str, float]] = {} + self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {} def hash_bin(self) -> int: """ @@ -1722,19 +1731,78 @@ def _get_acutual_font_widths( cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], - add_text: str, + text_operands: str, font_size: float, - default_space_width: float + space_width: float ) -> Tuple[float, float, float]: font_widths: float = 0 font_name: str = cmap[2] 
if font_name not in self._font_width_maps: - self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2) - font_width_map: Dict[Any, float] = self._font_width_maps[font_name] - if add_text: - for char in add_text: + if cmap[3] is None: + font_width_map: Dict[Any, float] = {} + space_char = " " + actual_space_width: float = space_width + font_width_map["default"] = actual_space_width * 2 + else: + space_code = 32 + _, space_code = parse_encoding(cmap[3], space_code) + _, space_code, _ = parse_to_unicode(cmap[3], space_code) + if isinstance(space_code, str): + space_char = space_code + else: + space_char = chr(space_code) + font_width_map = build_font_width_map(cmap[3], space_width * 2) + actual_space_width = compute_font_width(font_width_map, space_char) + if actual_space_width == 0: + actual_space_width = space_width + self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) + font_width_map = self._font_width_maps[font_name][0] + space_char = self._font_width_maps[font_name][1] + actual_space_width = self._font_width_maps[font_name][2] + + if text_operands: + for char in text_operands: + if char == space_char: + font_widths += actual_space_width + continue font_widths += compute_font_width(font_width_map, char) - return (font_widths * font_size, default_space_width * font_size, font_size) + return (font_widths * font_size, space_width * font_size, font_size) + + def _handle_tj( + self, + text: str, + operands: List[Union[str, TextStringObject]], + cm_matrix: List[float], + tm_matrix: List[float], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + orientations: Tuple[int, ...], + font_size: float, + rtl_dir: bool, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + space_width: float, + actual_str_size: Dict[str, float] + ) -> Tuple[str, bool, Dict[str, float]]: + text_operands, is_str_operands = get_text_operands( + operands, 
cm_matrix, tm_matrix, cmap, orientations) + if is_str_operands: + text += text_operands + else: + text, rtl_dir = get_display_str( + text, + cm_matrix, + tm_matrix, # text matrix + cmap, + text_operands, + font_size, + rtl_dir, + visitor_text) + font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = ( + self._get_acutual_font_widths(cmap, text_operands, font_size, space_width)) + actual_str_size["str_widths"] += font_widths + + return text, rtl_dir, actual_str_size def _extract_text( self, @@ -1818,11 +1886,8 @@ def _extract_text( TL = 0.0 font_size = 12.0 # init just in case of - def current_spacewidth() -> float: - return _space_width / 1000.0 - - def current_strwidths() -> float: - return _actual_str_size["str_widths"] / 1000.0 + def compute_strwidths(str_widths: float) -> float: + return str_widths / 1000.0 def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm @@ -1945,7 +2010,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: ty = float(operands[1]) tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] - str_widths = current_strwidths() + str_widths = compute_strwidths(_actual_str_size["str_widths"]) _actual_str_size["str_widths"] = 0.0 elif operator == b"Tm": check_crlf_space = True @@ -1957,28 +2022,26 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: float(operands[4]), float(operands[5]), ] - str_widths = current_strwidths() + str_widths = compute_strwidths(_actual_str_size["str_widths"]) _actual_str_size["str_widths"] = 0.0 elif operator == b"T*": check_crlf_space = True tm_matrix[5] -= TL elif operator == b"Tj": check_crlf_space = True - text, rtl_dir, add_text = handle_tj( + text, rtl_dir, _actual_str_size = self._handle_tj( text, operands, cm_matrix, tm_matrix, # text matrix cmap, orientations, - output, font_size, rtl_dir, visitor_text, + 
_space_width, + _actual_str_size, ) - current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = ( - self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth())) - _actual_str_size["str_widths"] += current_font_widths else: return None if check_crlf_space: @@ -1994,7 +2057,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: font_size, visitor_text, str_widths, - _actual_str_size["space_width"], + compute_strwidths(_actual_str_size["space_width"]), _actual_str_size["str_height"] ) if text == "": diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index a1c0d1d91..72d492f6a 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -153,29 +153,25 @@ def crlf_space_check( return text, output, cm_prev, tm_prev -def handle_tj( - text: str, +def get_text_operands( operands: List[Union[str, TextStringObject]], cm_matrix: List[float], tm_matrix: List[float], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], - orientations: Tuple[int, ...], - output: str, - font_size: float, - rtl_dir: bool, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], -) -> Tuple[str, bool, str]: - add_text = "" + orientations: Tuple[int, ...] 
+) -> Tuple[str, bool]: + t: str = "" + is_str_operands = False m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): - text += operands[0] - add_text = operands[0] + t = operands[0] + is_str_operands = True else: - t: str = "" + t = "" tt: bytes = ( encode_pdfdocencoding(operands[0]) if isinstance(operands[0], str) @@ -196,47 +192,56 @@ def handle_tj( t = "".join( [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] ) - # "\u0590 - \u08FF \uFB50 - \uFDFF" - for x in [cmap[1][x] if x in cmap[1] else x for x in t]: - # x can be a sequence of bytes ; ex: habibi.pdf - if len(x) == 1: - xx = ord(x) - else: - xx = 1 - # fmt: off - if ( - # cases where the current inserting order is kept - (xx <= 0x2F) # punctuations but... - or 0x3A <= xx <= 0x40 # numbers (x30-39) - or 0x2000 <= xx <= 0x206F # upper punctuations.. - or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents - or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... 
- ): - text = x + text if rtl_dir else text + x - add_text = x if rtl_dir else add_text + x - elif ( # right-to-left characters set - 0x0590 <= xx <= 0x08FF - or 0xFB1D <= xx <= 0xFDFF - or 0xFE70 <= xx <= 0xFEFF - or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX - ): - if not rtl_dir: - rtl_dir = True - output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) - text = "" - text = x + text - add_text = x + add_text - else: # left-to-right - # print(">",xx,x,end="") - if rtl_dir: - rtl_dir = False - output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) - text = "" - text = text + x - add_text += x - # fmt: on - return text, rtl_dir, add_text + return (t, is_str_operands) + + +def get_display_str( + text: str, + cm_matrix: List[float], + tm_matrix: List[float], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + text_operands: str, + font_size: float, + rtl_dir: bool, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] +) -> Tuple[str, bool]: + # "\u0590 - \u08FF \uFB50 - \uFDFF" + for x in [cmap[1].get(x, x) for x in text_operands]: + # x can be a sequence of bytes ; ex: habibi.pdf + if len(x) == 1: + xx = ord(x) + else: + xx = 1 + # fmt: off + if ( + # cases where the current inserting order is kept + (xx <= 0x2F) # punctuations but... + or 0x3A <= xx <= 0x40 # numbers (x30-39) + or 0x2000 <= xx <= 0x206F # upper punctuations.. + or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents + or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... 
+ ): + text = x + text if rtl_dir else text + x + elif ( # right-to-left characters set + 0x0590 <= xx <= 0x08FF + or 0xFB1D <= xx <= 0xFDFF + or 0xFE70 <= xx <= 0xFEFF + or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX + ): + if not rtl_dir: + rtl_dir = True + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + text = "" + text = x + text + else: # left-to-right + if rtl_dir: + rtl_dir = False + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + text = "" + text = text + x + # fmt: on + return text, rtl_dir