From c8ba91488a8872431ffa15cbfee316b87bfb48b8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 27 Feb 2024 23:18:11 +0100 Subject: [PATCH 1/4] NEW: add reattach_fields function parse page/document annotations for orphan fields and reattach them to AcroForm/Fields closes #2453 --- pypdf/_writer.py | 46 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_writer.py | 21 ++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 258e5cab8..3de9347aa 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -932,6 +932,52 @@ def update_page_form_field_values( value if value in k[AA.AP]["/N"] else "/Off" ) + def reattach_fields( + self, page: Optional[PageObject] = None + ) -> List[DictionaryObject]: + """ + Parse annotations within the page looking for orphan fields and + reattach them into the Fields Structure + + Args: + page: page to analyze. + If none is provided, all pages will be analyzed + Returns: + list of reattached fields + """ + lst = [] + if page is None: + for p in self.pages: + lst += self.reattach_fields(p) + return lst + + try: + af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]) + except KeyError: + af = DictionaryObject() + self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = af + try: + fields = cast(ArrayObject, af[InteractiveFormDictEntries.Fields]) + except KeyError: + fields = ArrayObject() + af[NameObject(InteractiveFormDictEntries.Fields)] = fields + + if "/Annots" not in page: + return lst + annots = cast(ArrayObject, page["/Annots"]) + for idx in range(len(annots)): + ano = annots[idx] + indirect = isinstance(ano, IndirectObject) + ano = cast(DictionaryObject, ano.get_object()) + if ano.get("/Subtype", "") == "/Widget" and "/FT" in ano: + if ano.indirect_reference in fields: + continue + if not indirect: + annots[idx] = self._add_object(ano) + fields.append(ano.indirect_reference) + lst.append(ano) + return lst
+ def clone_reader_document_root(self, reader: PdfReader) -> None: """ Copy the reader document root to the writer and all sub elements, diff --git a/tests/test_writer.py b/tests/test_writer.py index 7bf644b53..28a333782 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1978,3 +1978,24 @@ def create_number_pdf(n) -> BytesIO: for n, page in enumerate(reader.pages): text = page.extract_text() assert text == str(n) + + +@pytest.mark.enable_socket() +def test_reattach_fields(): + """ + Test Reattach function + addressed in #2453 + """ + url = "https://github.com/py-pdf/pypdf/files/14241368/ExampleForm.pdf" + name = "iss2453.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + for p in reader.pages: + writer.add_page(p) + assert len(writer.reattach_fields()) == 15 + assert len(writer.reattach_fields()) == 0 # nothing to append anymore + assert len(writer._root_object["/AcroForm"]["/Fields"]) == 15 + writer = PdfWriter(clone_from=reader) + assert len(writer.reattach_fields()) == 7 + writer.reattach_fields() + assert len(writer._root_object["/AcroForm"]["/Fields"]) == 15 From f23f01eb77d5fa0238873dd00c6b53d2fcd4c843 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Feb 2024 08:05:15 +0100 Subject: [PATCH 2/4] test coverage --- pypdf/_writer.py | 5 ++++- tests/test_writer.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 3de9347aa..9c268b908 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -970,7 +970,10 @@ def reattach_fields( indirect = isinstance(ano, IndirectObject) ano = cast(DictionaryObject, ano.get_object()) if ano.get("/Subtype", "") == "/Widget" and "/FT" in ano: - if ano.indirect_reference in fields: + if ( + "indirect_reference" in ano.__dict__ + and ano.indirect_reference in fields + ): continue if not indirect: annots[idx] = self._add_object(ano) diff --git 
a/tests/test_writer.py b/tests/test_writer.py index 28a333782..89c79f681 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1999,3 +1999,15 @@ def test_reattach_fields(): assert len(writer.reattach_fields()) == 7 writer.reattach_fields() assert len(writer._root_object["/AcroForm"]["/Fields"]) == 15 + + writer = PdfWriter() + for p in reader.pages: + writer.add_page(p) + ano = writer.pages[0]["/Annots"][0].get_object() + del ano.indirect_reference + writer.pages[0]["/Annots"][0] = ano + assert isinstance(writer.pages[0]["/Annots"][0], DictionaryObject) + assert len(writer.reattach_fields(writer.pages[0])) == 6 + assert isinstance(writer.pages[0]["/Annots"][0], IndirectObject) + del writer.pages[1]["/Annots"] + assert len(writer.reattach_fields(writer.pages[1])) == 0 From 9060ec560037d22cd9da1746f44e64f842e389e5 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:18:53 +0100 Subject: [PATCH 3/4] adding documentation --- docs/user/forms.md | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/docs/user/forms.md b/docs/user/forms.md index 40ea1606a..ed36a35a5 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -43,9 +43,28 @@ Viewer to recompute the field's rendering, and may trigger a "save changes" dialog for users who open the generated PDF. ## A note about form fields and annotations - -The PDF form stores form fields as annotations with the subtype "\Widget". This means that the following two blocks of code will give fairly similar results: - +PDF forms have a dual-nature approach about the fields: +* within the root object, an `/AcroForm` structure exists. + Inside it you could find (optional): + - some global elements (Fonts, Ressources,...) 
+ - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) + - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers) ; the `/XFA` form overrides the page content. + - `/Fields` that houses array of indirect references that references TopMost(roots) _Field_ Objects +* within the page `/Annots`, you will spot `/Widget` annotations that defines the visual renderings. + +To flesh out this overview: +* the core specific properties of a fields are : + - `/FT` : Field Type (Button, Text, Choice, Signatures) + - `/T` : Partial Field Name (see PDF Reference for more details) + - `/V` : Field Value + - `/DV` : Default Field Value (used when reseting a form for exemple) +* in order to streamline readability, _Field_ Objects and _Widget_ Objects can be fused housing all properties. +* Field can be organised hierarchically, meaning one field can be placed under another. in such instances, the `/Parent` will stock an IndirectObject providing Bottom-Up links and `/Childs` is an array carrying IndirectObjects for Top-Down navigation ; _Widget_ Objects are still required for visual rendering ; to call upon them, use *full qualified field name* (with all the individual names of the parent objects are seperated by `.`) + For instance 2 (visual) fields both called _city_ but attached below _sender_ and _receiver_ ; the data full names will be _sender.city_ and _receiver.city_ +* When a field is repeated on multiple pages, the Field Object will have many _Widget_ Objects in `/Childs`. These objects are pure _widgets_, containing no _field_ specific data +* if Fields stores only hidden values, No _Widget_ are required. 
+ +In _pypdf_ fields are extracted from the `/Fields` array ```python from pypdf import PdfReader @@ -59,7 +78,7 @@ from pypdf.constants import AnnotationDictionaryAttributes reader = PdfReader("form.pdf") fields = [] -for page in reader.pages: +for page in reader.pagesP: for annot in page.annotations: annot = annot.get_object() if annot[AnnotationDictionaryAttributes.Subtype] == "/Widget": @@ -69,3 +88,10 @@ for page in reader.pages: However, while similar, there are some very important differences between the two above blocks of code. Most importantly, the first block will return a list of Field objects, where as the second will return more generic dictionary-like objects. The objects lists will *mostly* reference the same object in the underlying PDF, meaning you'll find that `obj_taken_fom_first_list.indirect_reference == obj_taken_from _second_list.indirect_reference`. Field objects are generally more ergonomic, as the exposed data can be access via clearly named properties. However, the more generic dictionary-like objects will contain data that the Field object does not expose, such as the Rect (the widget's position on the page). So, which to use will depend on your use case. However, it's also important to note that the two lists do not *always* refer to the same underlying PDF objects. For example, if the form contains radio buttons, you will find that `reader.get_fields()` will get the parent object (the group of radio buttons) whereas `page.annotations` will return all the child objects (the individual radio buttons). + + +__Caution: +Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. +It is recommended to use `.append() with the proper parameters`__ + +In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. 
This fix can not guess intermediate fields and will not report fields using the same _name_ From cb6d88f628fc263c03694ac189baa5d5fcff6b46 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:33:59 +0100 Subject: [PATCH 4/4] improve wording and formatting --- docs/user/forms.md | 58 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/docs/user/forms.md b/docs/user/forms.md index ed36a35a5..f7336c518 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -42,29 +42,36 @@ parameter is `True` by default for legacy compatibility, but this flags the PDF Viewer to recompute the field's rendering, and may trigger a "save changes" dialog for users who open the generated PDF. -## A note about form fields and annotations +## Some notes about form fields and annotations + PDF forms have a dual-nature approach about the fields: -* within the root object, an `/AcroForm` structure exists. + +* Within the root object, an `/AcroForm` structure exists. Inside it you could find (optional): - - some global elements (Fonts, Ressources,...) - - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) - - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers) ; the `/XFA` form overrides the page content. - - `/Fields` that houses array of indirect references that references TopMost(roots) _Field_ Objects -* within the page `/Annots`, you will spot `/Widget` annotations that defines the visual renderings. + + - some global elements (Fonts, Resources,...) 
+ - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) + - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content + - `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots) + +* Within the page `/Annots`, you will spot `/Widget` annotations that define the visual rendering. To flesh out this overview: -* the core specific properties of a fields are : - - `/FT` : Field Type (Button, Text, Choice, Signatures) - - `/T` : Partial Field Name (see PDF Reference for more details) - - `/V` : Field Value - - `/DV` : Default Field Value (used when reseting a form for exemple) -* in order to streamline readability, _Field_ Objects and _Widget_ Objects can be fused housing all properties. -* Field can be organised hierarchically, meaning one field can be placed under another. in such instances, the `/Parent` will stock an IndirectObject providing Bottom-Up links and `/Childs` is an array carrying IndirectObjects for Top-Down navigation ; _Widget_ Objects are still required for visual rendering ; to call upon them, use *full qualified field name* (with all the individual names of the parent objects are seperated by `.`) - For instance 2 (visual) fields both called _city_ but attached below _sender_ and _receiver_ ; the data full names will be _sender.city_ and _receiver.city_ -* When a field is repeated on multiple pages, the Field Object will have many _Widget_ Objects in `/Childs`. These objects are pure _widgets_, containing no _field_ specific data -* if Fields stores only hidden values, No _Widget_ are required. 
- -In _pypdf_ fields are extracted from the `/Fields` array + +* The core specific properties of a field are: + - `/FT`: Field Type (Button, Text, Choice, Signatures) + - `/T`: Partial Field Name (see PDF Reference for more details) + - `/V`: Field Value + - `/DV`: Default Field Value (used when resetting a form for example) +* In order to streamline readability, _Field_ Objects and _Widget_ Objects can be fused housing all properties. +* Fields can be organised hierarchically, i.e. one field can be placed under another. In such instances, the `/Parent` will have an IndirectObject providing Bottom-Up links and `/Kids` is an array carrying IndirectObjects for Top-Down navigation; _Widget_ Objects are still required for visual rendering. To call upon them, use the *fully qualified field name* (where all the individual names of the parent objects are separated by `.`) + + For instance take two (visual) fields both called _city_, but attached below _sender_ and _receiver_; the corresponding full names will be _sender.city_ and _receiver.city_. +* When a field is repeated on multiple pages, the Field Object will have many _Widget_ Objects in `/Kids`. These objects are pure _widgets_, containing no _field_ specific data. +* If Fields stores only hidden values, no _Widgets_ are required. + +In _pypdf_ fields are extracted from the `/Fields` array: + ```python from pypdf import PdfReader @@ -78,20 +85,17 @@ from pypdf.constants import AnnotationDictionaryAttributes reader = PdfReader("form.pdf") fields = [] -for page in reader.pagesP: +for page in reader.pages: for annot in page.annotations: annot = annot.get_object() if annot[AnnotationDictionaryAttributes.Subtype] == "/Widget": fields.append(annot) ``` -However, while similar, there are some very important differences between the two above blocks of code. Most importantly, the first block will return a list of Field objects, where as the second will return more generic dictionary-like objects. 
The objects lists will *mostly* reference the same object in the underlying PDF, meaning you'll find that `obj_taken_fom_first_list.indirect_reference == obj_taken_from _second_list.indirect_reference`. Field objects are generally more ergonomic, as the exposed data can be access via clearly named properties. However, the more generic dictionary-like objects will contain data that the Field object does not expose, such as the Rect (the widget's position on the page). So, which to use will depend on your use case. - -However, it's also important to note that the two lists do not *always* refer to the same underlying PDF objects. For example, if the form contains radio buttons, you will find that `reader.get_fields()` will get the parent object (the group of radio buttons) whereas `page.annotations` will return all the child objects (the individual radio buttons). +However, while similar, there are some very important differences between the two above blocks of code. Most importantly, the first block will return a list of Field objects, whereas the second will return more generic dictionary-like objects. The object lists will *mostly* reference the same object in the underlying PDF, meaning you'll find that `obj_taken_from_first_list.indirect_reference == obj_taken_from_second_list.indirect_reference`. Field objects are generally more ergonomic, as the exposed data can be accessed via clearly named properties. However, the more generic dictionary-like objects will contain data that the Field object does not expose, such as the Rect (the widget's position on the page). Therefore, the correct approach depends on your use case. +However, it's also important to note that the two lists do not *always* refer to the same underlying PDF object. 
For example, if the form contains radio buttons, you will find that `reader.get_fields()` will get the parent object (the group of radio buttons) whereas `page.annotations` will return all the child objects (the individual radio buttons). -__Caution: -Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. -It is recommended to use `.append() with the proper parameters`__ +__Caution: Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead.__ -In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_ +In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse the annotations of the page(s) and reattach them. This fix cannot guess intermediate fields and will not report fields using the same _name_.