From 8ebd311a4088da81dbd59c7ecc7de13c4e86f595 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 14 Sep 2024 16:02:58 +0200 Subject: [PATCH] MAINT: Simplify test with None and NullObject (#2829) --- pypdf/_cmap.py | 7 ++++--- pypdf/_doc_common.py | 5 +++-- pypdf/_page.py | 21 +++++++++++++-------- pypdf/_page_labels.py | 19 ++++++++++++------- pypdf/_reader.py | 10 ++++++---- pypdf/_writer.py | 14 ++++++++------ pypdf/filters.py | 4 +--- pypdf/generic/__init__.py | 2 ++ pypdf/generic/_base.py | 10 ++++++++++ pypdf/generic/_data_structures.py | 11 +++++++---- pypdf/generic/_fit.py | 5 +++-- pypdf/generic/_viewerpref.py | 6 +++--- tests/test_generic.py | 16 ++++++++++++++++ 13 files changed, 88 insertions(+), 42 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 6c5996703..dcf3678bd 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -7,8 +7,8 @@ from .generic import ( DecodedStreamObject, DictionaryObject, - NullObject, StreamObject, + is_null_or_none, ) @@ -468,7 +468,7 @@ def compute_space_width( cpt += 1 sp_width = m / max(1, cpt) / 2 - if sp_width is None or isinstance(sp_width, NullObject): + if is_null_or_none(sp_width): sp_width = 0.0 return sp_width @@ -482,8 +482,9 @@ def type1_alternative( if "/FontDescriptor" not in ft: return map_dict, space_code, int_entry ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile") - if ft_desc is None: + if is_null_or_none(ft_desc): return map_dict, space_code, int_entry + assert ft_desc is not None, "mypy" txt = ft_desc.get_object().get_data() txt = txt.split(b"eexec\n")[0] # only clear part txt = txt.split(b"/Encoding")[1] # to get the encoding part diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 8d07098b4..55c6aad67 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -85,6 +85,7 @@ TreeObject, ViewerPreferences, create_string_object, + is_null_or_none, ) from .types import OutlineType, PagemodeType from .xmp import 
XmpInformation @@ -761,7 +762,7 @@ def _get_inherited(obj: DictionaryObject, key: str) -> Any: field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore except Exception as exc: raise ValueError("field type is invalid") from exc - if _get_inherited(field, "/FT") is None: + if is_null_or_none(_get_inherited(field, "/FT")): raise ValueError("field is not valid") ret = [] if field.get("/Subtype", "") == "/Widget": @@ -852,7 +853,7 @@ def _get_outline( return outline # §12.3.3 Document outline, entries in the outline dictionary - if lines is not None and "/First" in lines: + if not is_null_or_none(lines) and "/First" in lines: node = cast(DictionaryObject, lines["/First"]) self._namedDests = self._get_named_destinations() diff --git a/pypdf/_page.py b/pypdf/_page.py index 471256eec..e4ec053c8 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -84,6 +84,7 @@ PdfObject, RectangleObject, StreamObject, + is_null_or_none, ) try: @@ -101,7 +102,7 @@ def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleOb retval: Union[None, RectangleObject, IndirectObject] = self.get(name) if isinstance(retval, RectangleObject): return retval - if retval is None: + if is_null_or_none(retval): for d in defaults: retval = self.get(d) if retval is not None: @@ -492,7 +493,8 @@ def __init__( self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] self.indirect_reference = indirect_reference - if indirect_reference is not None: + if not is_null_or_none(indirect_reference): + assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) def hash_bin(self) -> int: @@ -731,9 +733,10 @@ def _get_inline_images(self) -> Dict[str, ImageFile]: entries will be identified as ~1~ """ content = self.get_contents() - if content is None: + if is_null_or_none(content): return {} imgs_data = [] + assert content is not None, "mypy" for param, ope in 
content.operations: if ope == b"INLINE IMAGE": imgs_data.append( @@ -1063,7 +1066,7 @@ def replace_contents( for i in range(len(content)): content[i] = self.indirect_reference.pdf._add_object(content[i]) - if content is None: + if is_null_or_none(content): if PG.CONTENTS not in self: return else: @@ -1084,6 +1087,7 @@ def replace_contents( # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content else: + assert content is not None, "mypy" content.indirect_reference = self[ PG.CONTENTS ].indirect_reference # TODO: in a future may required generation management @@ -2218,10 +2222,11 @@ def extract_text( if extraction_mode not in ["plain", "layout"]: raise ValueError(f"Invalid text extraction mode '{extraction_mode}'") if extraction_mode == "layout": - for visitor in ("visitor_operand_before", - "visitor_operand_after", - "visitor_text", - ): + for visitor in ( + "visitor_operand_before", + "visitor_operand_after", + "visitor_text", + ): if locals()[visitor]: logger_warning( f"Argument {visitor} is ignored in layout mode", diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index b02527950..1bedc003a 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -62,7 +62,13 @@ from ._protocols import PdfCommonDocProtocol from ._utils import logger_warning -from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject +from .generic import ( + ArrayObject, + DictionaryObject, + NullObject, + NumberObject, + is_null_or_none, +) def number2uppercase_roman_numeral(num: int) -> str: @@ -180,11 +186,13 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str: # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]} limits = cast(List[int], kid["/Limits"]) if limits[0] <= index <= limits[1]: - if kid.get("/Kids", None) is not None: + if not is_null_or_none(kid.get("/Kids", None)): # Recursive definition. 
level += 1 if level == 100: # pragma: no cover - raise NotImplementedError("Too deep nesting is not supported.") + raise NotImplementedError( + "Too deep nesting is not supported." + ) number_tree = kid # Exit the inner `for` loop and continue at the next level with the # next iteration of the `while` loop. @@ -195,10 +203,7 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str: # and continue with the fallback. break - logger_warning( - f"Could not reliably determine page label for {index}.", - __name__ - ) + logger_warning(f"Could not reliably determine page label for {index}.", __name__) return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 58c160302..9948cbea3 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -79,6 +79,7 @@ PdfObject, StreamObject, TextStringObject, + is_null_or_none, read_object, ) from .xmp import XmpInformation @@ -206,11 +207,11 @@ def _info(self) -> Optional[DictionaryObject]: /Info Dictionary; None if the entry does not exist """ info = self.trailer.get(TK.INFO, None) - if info is None: + if is_null_or_none(info): return None else: info = info.get_object() - if info is None: + if info == None: # noqa: E711 raise PdfReadError( "Trailer not found or does not point to document information directory" ) @@ -225,7 +226,7 @@ def _ID(self) -> Optional[ArrayObject]: /ID array; None if the entry does not exist """ id = self.trailer.get(TK.ID, None) - return None if id is None else cast(ArrayObject, id.get_object()) + return None if is_null_or_none(id) else cast(ArrayObject, id.get_object()) def _repr_mimebundle_( self, @@ -298,8 +299,9 @@ def _get_page_number_by_indirect( x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore } - if indirect_reference is None or isinstance(indirect_reference, NullObject): + if is_null_or_none(indirect_reference): return None + assert isinstance(indirect_reference, (int, IndirectObject)), 
"mypy" if isinstance(indirect_reference, int): idnum = indirect_reference else: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 1e6cb9e26..4d4cca329 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -107,6 +107,7 @@ ViewerPreferences, create_string_object, hex_to_rgb, + is_null_or_none, ) from .pagerange import PageRange, PageRangeSpec from .types import ( @@ -499,7 +500,7 @@ def _add_page( cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference) self.flattened_pages.append(page) cpt = 1000 - while node is not None: + while not is_null_or_none(node): node = cast(DictionaryObject, node.get_object()) node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1) node = node.get(PA.PARENT, None) @@ -612,8 +613,9 @@ def _get_page_number_by_indirect( The page number or None """ # to provide same function as in PdfReader - if indirect_reference is None or isinstance(indirect_reference, NullObject): + if is_null_or_none(indirect_reference): return None + assert indirect_reference is not None, "mypy" if isinstance(indirect_reference, int): indirect_reference = IndirectObject(indirect_reference, 0, self) obj = indirect_reference.get_object() @@ -928,7 +930,7 @@ def _update_field_annotation( ) dr = dr.get_object().get("/Font", DictionaryObject()).get_object() font_res = dr.get(font_name, None) - if font_res is not None: + if not is_null_or_none(font_res): font_res = cast(DictionaryObject, font_res.get_object()) font_subtype, _, font_encoding, font_map = build_char_map_from_dict( 200, font_res @@ -1566,9 +1568,9 @@ def metadata(self) -> Optional[DocumentInformation]: Retrieve/set the PDF file's document information dictionary, if it exists. Args: - value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF. + value: dict with the entries to be set. if None : remove the /Info entry from the pdf. 
- Note that some PDF files use (XMP) metadata streams instead of document + Note that some PDF files use (xmp)metadata streams instead of document information dictionaries, and these metadata streams will not be accessed by this function. """ @@ -2981,7 +2983,7 @@ def _get_filtered_outline( if node is None: node = NullObject() node = node.get_object() - if node is None or isinstance(node, NullObject): + if is_null_or_none(node): node = DictionaryObject() if node.get("/Type", "") == "/Outlines" or "/Title" not in node: node = node.get("/First", None) diff --git a/pypdf/filters.py b/pypdf/filters.py index 7589c8051..e2fdd0d8c 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -746,9 +746,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, ) # for error reporting - if ( - hasattr(x_object_obj, "indirect_reference") and x_object_obj is None - ): # pragma: no cover + if x_object_obj is None: # pragma: no cover obj_as_text = x_object_obj.indirect_reference.__repr__() else: obj_as_text = x_object_obj.__repr__() diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 63ccf1bdc..d9b0ea488 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -46,6 +46,7 @@ PdfObject, TextStringObject, encode_pdfdocencoding, + is_null_or_none, ) from ._data_structures import ( ArrayObject, @@ -235,6 +236,7 @@ def link( "encode_pdfdocencoding", "decode_pdfdocencoding", "hex_to_rgb", + "is_null_or_none", "read_hex_string_from_stream", "read_string_from_stream", ] diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index d02a79810..fd7d1a8ff 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -214,6 +214,16 @@ def __repr__(self) -> str: return "NullObject" +def is_null_or_none(x: Any) -> bool: + """ + Returns: + True if x is None or NullObject. 
+ """ + return x is None or ( + isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject) + ) + + class BooleanObject(PdfObject): def __init__(self, value: Any) -> None: self.value = value diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 215f2c75e..cc4b4a032 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -79,6 +79,7 @@ NumberObject, PdfObject, TextStringObject, + is_null_or_none, ) from ._fit import Fit from ._image_inline import ( @@ -451,7 +452,7 @@ def xmp_metadata(self) -> Optional[XmpInformationProtocol]: from ..xmp import XmpInformation metadata = self.get("/Metadata", None) - if metadata is None: + if is_null_or_none(metadata): return None metadata = metadata.get_object() @@ -651,7 +652,7 @@ def children(self) -> Iterable[Any]: if child == self[NameObject("/Last")]: return child_ref = child.get(NameObject("/Next")) # type: ignore - if child_ref is None: + if is_null_or_none(child_ref): return child = child_ref.get_object() @@ -661,8 +662,9 @@ def add_child(self, child: Any, pdf: PdfWriterProtocol) -> None: def inc_parent_counter_default( self, parent: Union[None, IndirectObject, "TreeObject"], n: int ) -> None: - if parent is None: + if is_null_or_none(parent): return + assert parent is not None, "mypy" parent = cast("TreeObject", parent.get_object()) if "/Count" in parent: parent[NameObject("/Count")] = NumberObject( @@ -673,8 +675,9 @@ def inc_parent_counter_default( def inc_parent_counter_outline( self, parent: Union[None, IndirectObject, "TreeObject"], n: int ) -> None: - if parent is None: + if is_null_or_none(parent): return + assert parent is not None, "mypy" parent = cast("TreeObject", parent.get_object()) # BooleanObject requires comparison with == not is opn = parent.get("/%is_open%", True) == True # noqa diff --git a/pypdf/generic/_fit.py b/pypdf/generic/_fit.py index 4132f4b71..c44d12b4c 100644 --- a/pypdf/generic/_fit.py +++ b/pypdf/generic/_fit.py @@ 
-1,5 +1,7 @@ from typing import Any, Optional, Tuple, Union +from ._base import is_null_or_none + class Fit: def __init__( @@ -9,8 +11,7 @@ def __init__( self.fit_type = NameObject(fit_type) self.fit_args = [ - NullObject() if a is None or isinstance(a, NullObject) else FloatObject(a) - for a in fit_args + NullObject() if is_null_or_none(a) else FloatObject(a) for a in fit_args ] @classmethod diff --git a/pypdf/generic/_viewerpref.py b/pypdf/generic/_viewerpref.py index a12f2d446..72f89d9ae 100644 --- a/pypdf/generic/_viewerpref.py +++ b/pypdf/generic/_viewerpref.py @@ -32,7 +32,7 @@ Optional, ) -from ._base import BooleanObject, NameObject, NumberObject +from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none from ._data_structures import ArrayObject, DictionaryObject f_obj = BooleanObject(False) @@ -156,8 +156,8 @@ def _add_prop_int(key: str, deft: Optional[int]) -> property: def __init__(self, obj: Optional[DictionaryObject] = None) -> None: super().__init__(self) - if obj is not None: - self.update(obj.items()) + if not is_null_or_none(obj): + self.update(obj.items()) # type: ignore try: self.indirect_reference = obj.indirect_reference # type: ignore except AttributeError: diff --git a/tests/test_generic.py b/tests/test_generic.py index a13aa7b09..d5fad26d7 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -33,6 +33,7 @@ TreeObject, create_string_object, encode_pdfdocencoding, + is_null_or_none, read_hex_string_from_stream, read_object, read_string_from_stream, @@ -1139,3 +1140,18 @@ def test_missing_hashbin(): assert NullObject().hash_bin() == hash((NullObject,)) t = ByteStringObject(b"123") assert t.hash_bin() == hash((ByteStringObject, b"123")) + + +def test_is_null_or_none(): + assert is_null_or_none(NullObject()) + assert not is_null_or_none(PdfObject()) + + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + # used with get + assert is_null_or_none(reader.root_object.get("/do_no_exist")) + # object unknown... 
+ assert is_null_or_none(IndirectObject(99999, 0, reader).get_object()) + # ... or which has been replaced with NullObject + writer = PdfWriter(reader) + writer.pages[0]["/Contents"].append(writer._add_object(NullObject())) + assert is_null_or_none(writer.pages[0]["/Contents"][-1])