py-pdf · MartinThoma · Aug 13, 2023 · Jul 30, 2023 · Jul 30, 2023 · Jul 30, 2023
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -17,15 +17,36 @@ def build_char_map(
     Determine information about a font.
 
     Args:
-        font_name:
-        space_width:
-        obj:
+        font_name: font name as a string
+        space_width: default space width if no data is found.
+        obj: XObject or Page where you can find a /Resource dictionary
 
     Returns:
-        Font sub-type, space_width/2, encoding, map character-map, font-dictionary.
+        Font sub-type, space_width criteria (50% of width), encoding, map character-map, font-dictionary.
         The font-dictionary itself is suitable for the curious.
     """
     ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
+    font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
+        space_width, ft
+    )
+    return font_subtype, font_halfspace, font_encoding, font_map, ft
+
+
+def build_char_map_from_dict(
+    space_width: float, ft: DictionaryObject
+) -> Tuple[str, float, Union[str, Dict[int, str]], Dict]:
+    """
+    Determine information about a font.
+
+    Args:
+        space_width: default space width if no data is found
+             (normally half the width of a character).
+        ft: Font Dictionary
+
+    Returns:
+        Font sub-type, space_width criteria (50% of width), encoding, map character-map.
+        Unlike build_char_map, the font dictionary itself is not returned.
+    """
     font_type: str = cast(str, ft["/Subtype"])
 
     space_code = 32
@@ -73,7 +94,6 @@ def build_char_map(
         encoding,
         # https://github.com/python/mypy/issues/4374
         map_dict,
-        ft,
     )
 
 

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -54,6 +54,7 @@
     cast,
 )
 
+from ._cmap import build_char_map_from_dict
 from ._encryption import EncryptAlgorithm, Encryption
 from ._page import PageObject, _VirtualList
 from ._page_labels import nums_clear_range, nums_insert, nums_next
@@ -847,6 +848,45 @@ def _update_text_field(self, field: DictionaryObject) -> None:
             da = " ".join(font_properties)
         y_offset = rct.height - 1 - font_height
 
+        # Retrieve font information from local DR ...
+        dr: Any = cast(
+            DictionaryObject,
+            cast(DictionaryObject, field.get("/DR", DictionaryObject())).get_object(),
+        )
+        dr = dr.get("/Font", DictionaryObject()).get_object()
+        if font_name not in dr:
+            # ...or AcroForm dictionary
+            dr = cast(
+                dict,
+                cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}),
+            )
+            if isinstance(dr, IndirectObject):  # pragma: no cover
+                dr = dr.get_object()
+            dr = dr.get("/Font", DictionaryObject()).get_object()
+        font_res = dr.get(font_name)
+        if font_res is not None:
+            font_res = cast(DictionaryObject, font_res.get_object())
+            font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
+                200, font_res
+            )
+            try:  # get rid of width stored in -1 key
+                del font_map[-1]
+            except KeyError:
+                pass
+            font_full_rev: Dict[str, bytes]
+            if isinstance(font_encoding, str):
+                font_full_rev = {
+                    v: k.encode(font_encoding) for k, v in font_map.items()
+                }
+            else:
+                font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
+                font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
+                for kk, v in font_map.items():
+                    font_full_rev[v] = font_encoding_rev.get(kk, kk)
+        else:
+            logger_warning(f"Font dictionary for {font_name} not found.", __name__)
+            font_full_rev = {}
+
         # Retrieve field text and selected values
         field_flags = field.get(FA.Ff, 0)
         if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
@@ -872,7 +912,13 @@ def _update_text_field(self, field: DictionaryObject) -> None:
             else:
                 # Td is a relative translation
                 ap_stream += f"0 {- font_height * 1.4} Td\n".encode()
-            ap_stream += b"(" + str(line).encode("UTF-8") + b") Tj\n"
+            enc_line: List[bytes] = [
+                font_full_rev.get(c, c.encode("utf-16-be")) for c in line
+            ]
+            if any(len(c) >= 2 for c in enc_line):
+                ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n"
+            else:
+                ap_stream += b"(" + b"".join(enc_line) + b") Tj\n"
         ap_stream += b"ET\nQ\nEMC\nQ\n"
 
         # Create appearance dictionary
@@ -886,22 +932,16 @@ def _update_text_field(self, field: DictionaryObject) -> None:
             }
         )
 
-        # Retrieve font information from AcroForm dictionary
-        dr: Any = cast(
-            dict, cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {})
-        )
-        if isinstance(dr, IndirectObject):
-            dr = dr.get_object()
-        dr = dr.get("/Font", {})
-        if isinstance(dr, IndirectObject):
-            dr = dr.get_object()
-
         # Update Resources with font information if necessary
-        if font_name in dr:
+        if font_res is not None:
             dct[NameObject("/Resources")] = DictionaryObject(
                 {
                     NameObject("/Font"): DictionaryObject(
-                        {NameObject(font_name): dr[font_name].indirect_reference}
+                        {
+                            NameObject(font_name): getattr(
+                                font_res, "indirect_reference", font_res
+                            )
+                        }
                     )
                 }
             )

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -1591,6 +1591,29 @@ def test_missing_info():
     assert len(writer.pages) == len(reader.pages)
 
 
+@pytest.mark.enable_socket()
+def test_germanfields():
+    """Non-ASCII field text must appear in the /AP /N stream as single bytes. Cf #2035"""
+    url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf"
+    name = "germanfields.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    writer = PdfWriter(clone_from=reader)
+    form_fields = {"Text Box 1": "test æ ø å"}
+    writer.update_page_form_field_values(
+        writer.pages[0], form_fields, auto_regenerate=False
+    )
+    bytes_stream = BytesIO()
+    writer.write(bytes_stream)
+    bytes_stream.seek(0)
+    reader2 = PdfReader(bytes_stream)
+    assert (
+        b"test \xe6 \xf8 \xe5"  # one byte each for the æ, ø, å written above
+        in reader2.get_fields()["Text Box 1"]
+        .indirect_reference.get_object()["/AP"]["/N"]
+        .get_data()
+    )
+
+
 @pytest.mark.enable_socket()
 def test_no_t_in_articles():
     """Cf #2078"""