From cb146e81eabec75e01380f881cd7de5c337df091 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 2 Mar 2024 17:16:41 +0100 Subject: [PATCH] ENH: add get_pages_from_field (#2494) * DEV: add _get_page_number_from_indirect in writer create similar function to have same API as in reader used in future dev --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- docs/user/forms.md | 8 +- pypdf/_reader.py | 74 +++++++++++++++++- pypdf/_writer.py | 71 +++++++++++++++++ tests/test_workflows.py | 166 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 316 insertions(+), 3 deletions(-) diff --git a/docs/user/forms.md b/docs/user/forms.md index f7336c518..7fb932813 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -50,7 +50,7 @@ PDF forms have a dual-nature approach about the fields: Inside it you could find (optional): - some global elements (Fonts, Resources,...) - - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) + - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch) - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content - `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots) @@ -99,3 +99,9 @@ However, it's also important to note that the two lists do not *always* refer to __Caution: Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. 
It is recommended to use `.append()` with the proper parameters instead.__ In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_. + +## Identify pages where fields are used + +In order to ease locating page fields you can use `reader.get_pages_showing_field()` or `writer.get_pages_showing_field()`. This method accepts a field object, i.e. a *PdfObject* that represents a field (such as those extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g. radio buttons or text displayed on multiple pages). + +The page numbers can then be retrieved as usual by using `page.page_number`. diff --git a/pypdf/_reader.py b/pypdf/_reader.py index a2ec36288..230852653 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -667,6 +667,76 @@ def indexed_key(k: str, fields: Dict[Any, Any]) -> str: ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") return ff + def get_pages_showing_field( + self, field: Union[Field, PdfObject, IndirectObject] + ) -> List[PageObject]: + """ + Provides list of pages where the field is called. + + Args: + field: Field Object, PdfObject or IndirectObject referencing a Field + + Returns: + List of pages: + - Empty list: + The field has no widgets attached + (either hidden field or ancestor field). + - Single page list: + Page where the widget is present + (most common). + - Multi-page list: + Field with multiple kids widgets + (example: radio buttons, field repeated on multiple pages). 
+ """ + + def _get_inherited(obj: DictionaryObject, key: str) -> Any: + if key in obj: + return obj[key] + elif "/Parent" in obj: + return _get_inherited( + cast(DictionaryObject, obj["/Parent"].get_object()), key + ) + else: + return None + + try: + # to cope with all types + field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore + except Exception as exc: + raise ValueError("field type is invalid") from exc + if _get_inherited(field, "/FT") is None: + raise ValueError("field is not valid") + ret = [] + if field.get("/Subtype", "") == "/Widget": + if "/P" in field: + ret = [field["/P"].get_object()] + else: + ret = [ + p + for p in self.pages + if field.indirect_reference in p.get("/Annots", "") + ] + else: + kids = field.get("/Kids", ()) + for k in kids: + k = k.get_object() + if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k): + # Kid that is just a widget, not a field: + if "/P" in k: + ret += [k["/P"].get_object()] + else: + ret += [ + p + for p in self.pages + if k.indirect_reference in p.get("/Annots", "") + ] + return [ + x + if isinstance(x, PageObject) + else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)]) # type: ignore + for x in ret + ] + def _get_named_destinations( self, tree: Union[TreeObject, None] = None, @@ -1813,7 +1883,9 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType: def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: """Take the permissions as an integer, return the allowed access.""" deprecate_with_replacement( - old_name="decode_permissions", new_name="user_access_permissions", removed_in="5.0.0" + old_name="decode_permissions", + new_name="user_access_permissions", + removed_in="5.0.0", ) permissions_mapping = { diff --git a/pypdf/_writer.py b/pypdf/_writer.py index b343af217..db529eb8c 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -96,6 +96,7 @@ DecodedStreamObject, Destination, DictionaryObject, + Field, Fit, FloatObject, 
IndirectObject, @@ -1003,6 +1004,76 @@ def reattach_fields( lst.append(ano) return lst + def get_pages_showing_field( + self, field: Union[Field, PdfObject, IndirectObject] + ) -> List[PageObject]: + """ + Provides list of pages where the field is called. + + Args: + field: Field Object, PdfObject or IndirectObject referencing a Field + + Returns: + List of pages: + - Empty list: + The field has no widgets attached + (either hidden field or ancestor field). + - Single page list: + Page where the widget is present + (most common). + - Multi-page list: + Field with multiple kids widgets + (example: radio buttons, field repeated on multiple pages). + """ + + def _get_inherited(obj: DictionaryObject, key: str) -> Any: + if key in obj: + return obj[key] + elif "/Parent" in obj: + return _get_inherited( + cast(DictionaryObject, obj["/Parent"].get_object()), key + ) + else: + return None + + try: + # to cope with all types + field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore + except Exception as exc: + raise ValueError("field type is invalid") from exc + if _get_inherited(field, "/FT") is None: + raise ValueError("field is not valid") + ret = [] + if field.get("/Subtype", "") == "/Widget": + if "/P" in field: + ret = [field["/P"].get_object()] + else: + ret = [ + p + for p in self.pages + if field.indirect_reference in p.get("/Annots", "") + ] + else: + kids = field.get("/Kids", ()) + for k in kids: + k = k.get_object() + if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k): + # Kid that is just a widget, not a field: + if "/P" in k: + ret += [k["/P"].get_object()] + else: + ret += [ + p + for p in self.pages + if k.indirect_reference in p.get("/Annots", "") + ] + return [ + x + if isinstance(x, PageObject) + else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)]) # type: ignore + for x in ret + ] + def clone_reader_document_root(self, reader: PdfReader) -> None: """ Copy the reader document root to the writer 
and all sub elements, diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 07394adc7..9a4502c46 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -17,7 +17,14 @@ from pypdf import PdfMerger, PdfReader, PdfWriter from pypdf.constants import PageAttributes as PG from pypdf.errors import PdfReadError, PdfReadWarning -from pypdf.generic import ContentStream, NameObject, read_object +from pypdf.generic import ( + ArrayObject, + ContentStream, + DictionaryObject, + NameObject, + TextStringObject, + read_object, +) from . import get_data_from_url, normalize_warnings @@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode(): reader = PdfReader(pdf_path) with pytest.raises(ValueError, match="Invalid text extraction mode"): reader.pages[0].extract_text(extraction_mode="foo") # type: ignore + + +@pytest.mark.enable_socket() +def test_get_page_showing_field(): + """ + Uses testfile from #2452 in order to get fields on multiple pages, + choices boxes,... + """ + url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf" + name = "iss2452.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name))) + writer = PdfWriter(clone_from=reader) + + # validate with Field: only works on Reader (no get_fields on writer yet) + fld = reader.get_fields() + assert [ + p.page_number for p in reader.get_pages_showing_field(fld["FormVersion"]) + ] == [0] + + # validate with dictionary object + # NRCategory field is a radio box + assert [ + p.page_number + for p in reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object() + ) + ] == [0, 0, 0, 0, 0] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][8].get_object() + ) + ] == [0, 0, 0, 0, 0] + + # validate with IndirectObject + # SiteID field is a textbox on multiple pages + assert [ + p.page_number + for p in reader.get_pages_showing_field( + 
reader.trailer["/Root"]["/AcroForm"]["/Fields"][99] + ) + ] == [0, 1] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][99] + ) + ] == [0, 1] + # test directly on the widget: + assert [ + p.page_number + for p in reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1] + ) + ] == [1] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1] + ) + ] == [1] + + # Exceptions: + # Invalid Object + with pytest.raises(ValueError) as exc: + reader.get_pages_showing_field(None) + with pytest.raises(ValueError) as exc: + writer.get_pages_showing_field(None) + assert "field type is invalid" in exc.value.args[0] + + # Damage Field + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][1].get_object()["/FT"] + del writer._root_object["/AcroForm"]["/Fields"][1].get_object()["/FT"] + with pytest.raises(ValueError) as exc: + reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][1] + ) + with pytest.raises(ValueError) as exc: + writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1]) + assert "field is not valid" in exc.value.args[0] + + # missing Parent in field + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[ + "/Parent" + ] + del writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[ + "/Parent" + ] + with pytest.raises(ValueError) as exc: + reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][1] + ) + with pytest.raises(ValueError) as exc: + writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1]) + + # remove "/P" (optional) + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()[ + "/P" + ] + del writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()["/P"] + assert [ + p.page_number + for p in 
reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1] + ) + ] == [0] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1] + ) + ] == [0] + assert [ + p.page_number + for p in reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object() + ) + ] == [0, 0, 0, 0, 0] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][8].get_object() + ) + ] == [0, 0, 0, 0, 0] + + # Grouping fields + reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()[ + NameObject("/Kids") + ] = ArrayObject([reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]]) + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/T"] + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/P"] + del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/Subtype"] + writer._root_object["/AcroForm"]["/Fields"].append( + writer._add_object( + DictionaryObject( + { + NameObject("/T"): TextStringObject("grouping"), + NameObject("/FT"): NameObject("/Tx"), + NameObject("/Kids"): ArrayObject( + [reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]] + ), + } + ) + ) + ) + assert [ + p.page_number + for p in reader.get_pages_showing_field( + reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1] + ) + ] == [] + assert [ + p.page_number + for p in writer.get_pages_showing_field( + writer._root_object["/AcroForm"]["/Fields"][-1] + ) + ] == []